#Introduction

Libraries

library(tidyverse)
library(readr)          # import
library(rpart)          # regression trees
library(rpart.plot)     # regression tree plots
library(summarytools)   # summary statistics
library(party)          # ctree
library(partykit)       # ctree
library(caret)
library(forecast)
library(ineq)           # Gini
library(precrec)        # ROC curves
library(corrplot)       # Correlation plots
library(plotly)         # interactive ggplot2 plots :D
library(knitcitations); cleanbib()
cite_options(citation_format = "pandoc", check.entries=FALSE)
library(bibtex)
library(readr)

Data import

TEXT

# Folder that holds the Austrian 2019 files (relative to the working directory)
data_path <- "./AT2019"

# Read the personal-level file; the fields in this CSV are separated by ";"
data19 <- read.csv(
  file = file.path(data_path, "p_silc2019_ext.csv"),
  sep = ";"
)

Data Wrangling

First, we will select and rename the variables of interest. Then we will take the logarithm of net income and recode some of the variables.

# Build the analysis data set in one pipe: keep the variables of interest, give
# them readable names, restrict the sample, and derive/recode a few columns.
data19 <- data19 %>%
  
  # NOTE: M002000 is selected here but not renamed below, so it keeps its raw name.
  select(sex, P038004, P110000nu, P111010nu, alter, M009010, M010000, M014000, M016000, M017000, M020010, M021000, M025000, M027000, M028000, M004000, M001300, M001510, M003100, M001100, M001200, M002000, M001500) %>% 
  
  rename("inc_net"           = P038004,     # monthly income; NOTE(review): originally annotated "gross", but the name and the log variable below say net - confirm against the codebook
         "country_birth"     = P110000nu,   # country of birth of respondent
         "citizenship"       = P111010nu,   # citizenship of respondent
         "age"               = alter,       # age of respondent
         "father_cit"        = M009010,     # citizenship of father at age 14
         "father_edu"        = M010000,     # education of father at age 14 (highest degree attained)
         "father_occup_stat" = M014000,     # occupational status of father at age 14
         "father_occup"      = M016000,     # main occupation of father at age 14
         "father_manag"      = M017000,     # managerial position of father at age 14            
         "mother_cit"        = M020010,     # citizenship of mother at age 14
         "mother_edu"        = M021000,     # education of mother at age 14
         "mother_occup_stat" = M025000,     # occupational status of mother at age 14
         "mother_occup"      = M027000,     # main occupation of mother at age 14
         "mother_manag"      = M028000,     # managerial position of mother at age 14
         "tenancy"           = M004000,     # tenancy at age 14
         "children"          = M001300,     # number of children (under 18) in respondent’s household at age 14
         "adults"            = M001510,     # number of adults (aged 18 or more) in respondent’s household
         "adults_working"    = M003100,     # number of working adults (aged 18 or more) in respondent’s hhd.
         "father_present"    = M001100,     # father present in respondent’s household at age 14
         "mother_present"    = M001200,     # mother present in respondent’s household at age 14
         "adults_present"    = M001500,     # adults present in respondent’s household at age 14
         ) %>%    
  
  filter(age %in% (27:59), inc_net > 0, mother_present > 0, father_present > 0, father_cit > 0, mother_cit > 0)  %>%
  # Rows are kept only if age is 27-59 and income, parental presence and parental
  # citizenship are all coded > 0. Other columns are not filtered here.
  # Original note: we drop all answers where the respondents refused or were not able to provide information.
  # D: doesn't this drop the entire observations??
  # D: and if we filter by age (instead of removing the observations coded -6) we have to explain why 27-59 and not 25-59 and change the text below, because I had simply removed those coded -6 (age outside of range).
  
  mutate("inc_net_log" = log(inc_net),   
         # logged net income per month of respondent
         
         "both_parents_present" = father_present + mother_present,            
         # sum of the two presence codes: 4 = none present, 3 = one present, 2 = both present
         
         sex = factor(ifelse(as.numeric(sex)==2, 1, 0)), 
         # raw code 2 becomes 1, everything else 0; 0 = male, 1 = female
         
         country_birth = factor(country_birth, labels = c(1, 2, 2, 2, 3, 3)), 
         # six raw levels (in sorted order) collapsed to three: Austria, EU, Non-EU
         # NOTE: requires exactly six distinct raw values
         
         father_cit = ifelse(father_cit == 1, 1, 2),                          
         # 1 = Austria, 2 = Other
         
         mother_cit = ifelse(mother_cit == 1, 1, 2))   
         # 1 = Austria, 2 = Other

Data Exploration and Visualization

To get a good grasp of our data we will first look at some simple descriptives and a correlation plot. The ad-hoc module on intergenerational transmission of disadvantages only includes “selected respondents aged over 24 years and less than 60 years”. This is why we exclude respondents outside this age range: coding ‘-6’ means that the year of birth is outside the 1969 to 1994 range or the interview was a proxy interview. Additionally, we exclude all respondents that were not part of this ad-hoc module even if they were of the desired age.

Data Summary

# Render the summarytools overview of the wrangled data set
print(
  dfSummary(data19),
  method = "render",
  style = "grid",
  plain.ascii = FALSE
)

Data Frame Summary

data19

Dimensions: 3600 x 25
Duplicates: 1
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 sex [factor] 1. 0 2. 1
1729(48.0%)
1871(52.0%)
3600 (100.0%) 0 (0.0%)
2 inc_net [integer] Mean (sd) : 1979.5 (972) min < med < max: 60 < 1900 < 13000 IQR (CV) : 1010 (0.5) 621 distinct values 3600 (100.0%) 0 (0.0%)
3 country_birth [factor] 1. 1 2. 2 3. 3
2991(83.1%)
425(11.8%)
184(5.1%)
3600 (100.0%) 0 (0.0%)
4 citizenship [integer] Mean (sd) : 1.3 (0.9) min < med < max: 1 < 1 < 6 IQR (CV) : 0 (0.7)
1:3166(87.9%)
2:129(3.6%)
3:150(4.2%)
4:53(1.5%)
5:36(1.0%)
6:66(1.8%)
3600 (100.0%) 0 (0.0%)
5 age [integer] Mean (sd) : 43.5 (9.1) min < med < max: 27 < 44 < 58 IQR (CV) : 15 (0.2) 32 distinct values 3600 (100.0%) 0 (0.0%)
6 father_cit [numeric] Min : 1 Mean : 1.2 Max : 2
1:2979(82.8%)
2:621(17.2%)
3600 (100.0%) 0 (0.0%)
7 father_edu [integer] Mean (sd) : 3 (2.7) min < med < max: -2 < 2 < 9 IQR (CV) : 2 (0.9) 11 distinct values 3600 (100.0%) 0 (0.0%)
8 father_occup_stat [integer] Mean (sd) : 1 (1.3) min < med < max: -5 < 1 < 4 IQR (CV) : 0 (1.3)
-5:124(3.4%)
-2:24(0.7%)
1:2785(77.4%)
2:595(16.5%)
3:17(0.5%)
4:55(1.5%)
3600 (100.0%) 0 (0.0%)
9 father_occup [integer] Mean (sd) : 4.8 (3.1) min < med < max: -5 < 5 < 9 IQR (CV) : 4 (0.6) 13 distinct values 3598 (99.9%) 2 (0.1%)
10 father_manag [integer] Mean (sd) : 1.2 (1.5) min < med < max: -5 < 2 < 2 IQR (CV) : 1 (1.2)
-5:124(3.4%)
-3:55(1.5%)
-2:81(2.2%)
1:1286(35.7%)
2:2054(57.1%)
3600 (100.0%) 0 (0.0%)
11 mother_cit [numeric] Min : 1 Mean : 1.2 Max : 2
1:2998(83.3%)
2:602(16.7%)
3600 (100.0%) 0 (0.0%)
12 mother_edu [integer] Mean (sd) : 2.7 (2.5) min < med < max: -2 < 2 < 9 IQR (CV) : 3 (0.9) 11 distinct values 3600 (100.0%) 0 (0.0%)
13 mother_occup_stat [integer] Mean (sd) : 2.1 (1.5) min < med < max: -5 < 1 < 4 IQR (CV) : 3 (0.7)
-5:29(0.8%)
-2:10(0.3%)
1:1931(53.6%)
2:272(7.6%)
3:209(5.8%)
4:1149(31.9%)
3600 (100.0%) 0 (0.0%)
14 mother_occup [integer] Mean (sd) : 2.6 (4.3) min < med < max: -5 < 4 < 9 IQR (CV) : 8 (1.7) 13 distinct values 3599 (100.0%) 1 (0.0%)
15 mother_manag [integer] Mean (sd) : 0.1 (2.3) min < med < max: -5 < 2 < 2 IQR (CV) : 5 (22.1)
-5:29(0.8%)
-3:1149(31.9%)
-2:137(3.8%)
1:322(8.9%)
2:1963(54.5%)
3600 (100.0%) 0 (0.0%)
16 tenancy [integer] Mean (sd) : 1.7 (0.6) min < med < max: -3 < 2 < 3 IQR (CV) : 1 (0.4)
-3:21(0.6%)
-2:5(0.1%)
1:968(26.9%)
2:2555(71.0%)
3:51(1.4%)
3600 (100.0%) 0 (0.0%)
17 children [integer] Mean (sd) : 1.1 (0.5) min < med < max: -3 < 1 < 2 IQR (CV) : 0 (0.4)
-3:22(0.6%)
-2:1(0.0%)
1:2964(82.3%)
2:613(17.0%)
3600 (100.0%) 0 (0.0%)
18 adults [integer] Mean (sd) : -2.1 (1.8) min < med < max: -3 < -3 < 7 IQR (CV) : 0 (-0.8)
-3:2903(80.6%)
-2:1(0.0%)
1:410(11.4%)
2:248(6.9%)
3:20(0.6%)
4:13(0.4%)
5:2(0.1%)
6:2(0.1%)
7:1(0.0%)
3600 (100.0%) 0 (0.0%)
19 adults_working [integer] Mean (sd) : 1.8 (1.1) min < med < max: -3 < 2 < 14 IQR (CV) : 1 (0.6) 15 distinct values 3600 (100.0%) 0 (0.0%)
20 father_present [integer] Min : 1 Mean : 1.1 Max : 2
1:3119(86.6%)
2:481(13.4%)
3600 (100.0%) 0 (0.0%)
21 mother_present [integer] Min : 1 Mean : 1 Max : 2
1:3464(96.2%)
2:136(3.8%)
3600 (100.0%) 0 (0.0%)
22 M002000 [integer] Mean (sd) : -2.9 (0.9) min < med < max: -3 < -3 < 8 IQR (CV) : 0 (-0.3)
-3:3526(97.9%)
-2:1(0.0%)
1:32(0.9%)
2:12(0.3%)
3:5(0.1%)
4:2(0.1%)
5:1(0.0%)
6:8(0.2%)
7:11(0.3%)
8:2(0.1%)
3600 (100.0%) 0 (0.0%)
23 adults_present [integer] Mean (sd) : 1.7 (0.8) min < med < max: -3 < 2 < 2 IQR (CV) : 0 (0.5)
-3:22(0.6%)
-2:105(2.9%)
1:697(19.4%)
2:2776(77.1%)
3600 (100.0%) 0 (0.0%)
24 inc_net_log [numeric] Mean (sd) : 7.5 (0.5) min < med < max: 4.1 < 7.5 < 9.5 IQR (CV) : 0.5 (0.1) 621 distinct values 3600 (100.0%) 0 (0.0%)
25 both_parents_present [integer] Mean (sd) : 2.2 (0.4) min < med < max: 2 < 2 < 4 IQR (CV) : 0 (0.2)
2:3057(84.9%)
3:469(13.0%)
4:74(2.1%)
3600 (100.0%) 0 (0.0%)

Generated by summarytools 0.9.8 (R version 4.0.3)
2021-02-16

The data set includes 25 variables and 3600 observations. There are almost no missing values. There are slightly more females (52%) than males (48%). The income distribution is skewed to the right, meaning the median income is lower than the mean income. 83% of respondents were born in Austria and 88% are Austrian citizens. In about 85% of respondents' households at age 14 both parents were present, and in 96% the mother was present….

Gini Index and Lorenz curve

# Gini coefficient of monthly net income
ineq(data19[["inc_net"]], type = "Gini")
## [1] 0.2543448

The Gini index is 0.25 which is a bit lower than the World Bank estimate for Austria of 0.3 (2017) available at https://data.worldbank.org/indicator/SI.POV.GINI?locations=AT.

# Lorenz curve of monthly net income, drawn as a thick dark-red line
plot(
  Lc(data19[["inc_net"]]),
  col = "darkred",
  lwd = 3
)

The Gini index corresponds to the area below the black equal-distribution line and above the red line of the actual distribution (more precisely, to twice that area).

Age pyramid

# Age pyramid of the sample.
# Females (sex == 1) are drawn as ordinary counts; males (sex == 0) are drawn
# with their counts multiplied by -1 so the two groups mirror each other.
# coord_flip() then puts age on the vertical axis.
agepyra <- ggplot(data19, aes(x = age, fill = sex)) +
  geom_bar(data = subset(data19, sex == 1)) +
  geom_bar(data = subset(data19, sex == 0), aes(y = ..count.. * (-1))) +
  scale_x_continuous(breaks = seq(27, 59, 2)) +
  # The male counts are negative only for mirroring. Label the count axis with
  # absolute values so it does not show a negative "Number of people".
  # (abs() was previously applied to the age axis, where it had no effect.)
  scale_y_continuous(labels = abs) +
  scale_fill_manual(
    name = "Sex",
    labels = c("Male", "Female"),
    values = c("springgreen2", "slateblue1")
  ) +
  labs(
    title = "Age pyramide of ad-hoc module on intergenerational transmission of disadvantages",
    x = "Age",
    y = "Number of people"
  ) +
  theme_bw() +
  coord_flip()

# Interactive version of the same plot
ggplotly(agepyra)

In the Data Frame Summary above we already saw that there are slightly more females (1) than males (0) in the data set and that the median age is 44. While the age distribution of the sample is quite even, there are somewhat more older than younger individuals.

# Median age of the respondents in the sample
median(data19[["age"]])
## [1] 44

Correlation plot

# Correlation plot of the (numeric) variables.
#
# The significance test must be run on the observations themselves.
# Previously `data19cor` was overwritten with the correlation matrix before
# cor.mtest() was called, so the p-values were computed from the correlation
# matrix instead of from the data.

# Numeric copy of the data: the two factors are replaced by their integer codes
data19cor_input <- data19
data19cor_input$sex <- as.numeric(data19cor_input$sex)
data19cor_input$country_birth <- as.numeric(data19cor_input$country_birth)

# Dropping the categorical variables father_occup & mother_occup
data19cor_input <- select(data19cor_input, -c(father_occup, mother_occup))

# Significance of each pairwise correlation, tested on the data
res1 <- cor.mtest(data19cor_input, conf.level = 0.99)

# Correlation coefficients (same object name as before: the correlation matrix)
data19cor <- cor(data19cor_input)

# Cells whose p-value exceeds 0.01 are left blank
corrplot(
  data19cor,
  method = "ellipse",
  type = "upper",
  order = "FPC",
  diag = FALSE,
  outline = FALSE,
  tl.cex = .5,
  tl.col = "black",
  title = "Correlation plot",
  p.mat = res1$p,
  sig.level = 0.01,
  insig = "blank",
  mar = c(2, 2, 2, 2)
)

As can be seen from the correlation plot, all variables are significantly related to at least one other variable of the data set (at the 1% significance level). For better visibility insignificant correlations are blanked out. As the correlation matrix is ordered using the first principal component there is some clustering of significant correlations.

Method

Regression Tree

# Fix the random number generator so the random split below is reproducible
set.seed(123)

# Model specification shared by all tree models in this section:
# logged net income explained by the circumstance variables
formula <- inc_net_log ~ sex + country_birth + father_cit + father_edu +
  father_occup_stat + father_occup + father_manag + mother_cit +
  mother_edu + mother_occup_stat + mother_occup + mother_manag +
  tenancy + children + adults + adults_working + both_parents_present

# Label every row "train" (probability 0.80) or "test" (probability 0.20)
data19 <- data19 %>%
  mutate(
    train_index = sample(
      c("train", "test"),
      nrow(data19),
      replace = TRUE,
      prob = c(0.80, 0.20)
    )
  )

# Split the rows according to that label
train <- data19 %>%
  filter(train_index == "train")
test <- data19 %>%
  filter(train_index == "test")

# Resampling scheme for caret: 10-fold cross-validation, repeated 10 times
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  savePredictions = TRUE
)

# Candidate complexity parameters for rpart: 0 to 0.02 in steps of 0.005
tuning_grid <- expand.grid(cp = seq(0, 0.02, by = 0.005))
tuning_grid
##      cp
## 1 0.000
## 2 0.005
## 3 0.010
## 4 0.015
## 5 0.020
# Regression tree tuned over `tuning_grid` by repeated cross-validation (RMSE).
# NOTE(review): the model is fitted on the full `data19`, which also contains
# the rows labelled "test" - confirm this is intended before using `test` for
# an out-of-sample evaluation of this model.
caret_rpart <- train(
  formula,
  data = data19,
  method = "rpart",
  trControl = fitControl,
  tuneGrid = tuning_grid,
  metric = "RMSE",
  na.action = na.pass
)

caret_rpart
## CART 
## 
## 3600 samples
##   17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 3241, 3240, 3239, 3240, 3239, 3240, ... 
## Resampling results across tuning parameters:
## 
##   cp     RMSE       Rsquared   MAE      
##   0.000  0.5160590  0.1197761  0.3786742
##   0.005  0.4789493  0.1802058  0.3461936
##   0.010  0.4833582  0.1649875  0.3497187
##   0.015  0.4853702  0.1579280  0.3515222
##   0.020  0.4852788  0.1582175  0.3514775
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.005.
# Extract the tree refitted with the selected cp and draw it
tree_caret_final <- caret_rpart[["finalModel"]]
rpart.plot(
  tree_caret_final,
  type = 2,
  nn = FALSE,
  box.palette = "RdBu"
)

Conditional inference tree

# Original note: for the inference tree to work, all variables must be numeric.
# NOTE(review): `sex` and `country_birth` are factors at this point - confirm
# whether that note still applies.

# Conditional inference tree fitted on the training rows, with Bonferroni-adjusted
# tests. Original author's note: this probably already includes the control for
# inference trees; doing it with CV and caret is possible but had not worked yet.
Ctree <- ctree(
  formula,
  data = train,
  control = ctree_control(testtype = "Bonferroni")
)
Ctree
## 
## Model formula:
## inc_net_log ~ sex + country_birth + father_cit + father_edu + 
##     father_occup_stat + father_occup + father_manag + mother_cit + 
##     mother_edu + mother_occup_stat + mother_occup + mother_manag + 
##     tenancy + children + adults + adults_working + both_parents_present
## 
## Fitted party:
## [1] root
## |   [2] sex in 0
## |   |   [3] country_birth in 1, 2
## |   |   |   [4] mother_cit <= 1
## |   |   |   |   [5] father_edu <= 2
## |   |   |   |   |   [6] mother_edu <= -2: 7.288 (n = 9, err = 6.5)
## |   |   |   |   |   [7] mother_edu > -2: 7.696 (n = 777, err = 100.1)
## |   |   |   |   [8] father_edu > 2: 7.780 (n = 365, err = 73.9)
## |   |   |   [9] mother_cit > 1
## |   |   |   |   [10] father_edu <= 6
## |   |   |   |   |   [11] adults_working <= 0: 7.092 (n = 7, err = 1.7)
## |   |   |   |   |   [12] adults_working > 0: 7.540 (n = 116, err = 9.9)
## |   |   |   |   [13] father_edu > 6: 7.754 (n = 46, err = 12.8)
## |   |   [14] country_birth in 3: 7.313 (n = 71, err = 26.7)
## |   [15] sex in 1
## |   |   [16] mother_edu <= 2
## |   |   |   [17] father_cit <= 1: 7.259 (n = 897, err = 239.5)
## |   |   |   [18] father_cit > 1: 7.089 (n = 191, err = 58.9)
## |   |   [19] mother_edu > 2: 7.376 (n = 398, err = 132.1)
## 
## Number of inner nodes:     9
## Number of terminal nodes: 10
# Compact plot of the tree fitted on the training rows (node ids hidden)
# TODO: change the font size of the title
plot(
  Ctree,
  type = "simple",
  gp = gpar(fontsize = 6),
  inner_panel = node_inner,
  ip_args = list(id = FALSE),
  main = "Conditional Inference Tree for Austria 2019"
)

Instead of the default tuning function we use the caret package for cross validation.

### `data =` was changed from `train` to `data19` here!
# Conditional inference tree tuned by caret with the repeated-CV scheme above
caret_ctree <- train(
  formula,
  data = data19,
  method = "ctree",
  trControl = fitControl,
  na.action = na.pass
)
caret_ctree
## Conditional Inference Tree 
## 
## 3600 samples
##   17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 3240, 3241, 3240, 3240, 3240, 3240, ... 
## Resampling results across tuning parameters:
## 
##   mincriterion  RMSE       Rsquared   MAE      
##   0.01          0.4923692  0.1519835  0.3570260
##   0.50          0.4799440  0.1780711  0.3453309
##   0.99          0.4783652  0.1817853  0.3442732
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mincriterion = 0.99.
# Refit the conditional inference tree on all rows with mincriterion = 0.99,
# the value selected by the cross-validation above
caret_ctree_B <- ctree(
  formula,
  data = data19,
  control = ctree_control(
    testtype = "Bonferroni",
    mincriterion = 0.99
  )
)
caret_ctree_B
## 
## Model formula:
## inc_net_log ~ sex + country_birth + father_cit + father_edu + 
##     father_occup_stat + father_occup + father_manag + mother_cit + 
##     mother_edu + mother_occup_stat + mother_occup + mother_manag + 
##     tenancy + children + adults + adults_working + both_parents_present
## 
## Fitted party:
## [1] root
## |   [2] sex in 0
## |   |   [3] country_birth in 1, 2
## |   |   |   [4] mother_cit <= 1
## |   |   |   |   [5] mother_occup_stat <= -2: 7.306 (n = 15, err = 7.1)
## |   |   |   |   [6] mother_occup_stat > -2: 7.725 (n = 1416, err = 209.5)
## |   |   |   [7] mother_cit > 1
## |   |   |   |   [8] father_edu <= 6: 7.518 (n = 152, err = 21.4)
## |   |   |   |   [9] father_edu > 6: 7.820 (n = 54, err = 15.7)
## |   |   [10] country_birth in 3: 7.326 (n = 92, err = 32.6)
## |   [11] sex in 1
## |   |   [12] father_edu <= 2
## |   |   |   [13] father_cit <= 1: 7.253 (n = 1055, err = 290.7)
## |   |   |   [14] father_cit > 1: 7.036 (n = 204, err = 64.0)
## |   |   [15] father_edu > 2: 7.372 (n = 612, err = 169.4)
## 
## Number of inner nodes:    7
## Number of terminal nodes: 8
# Tree refitted with the cross-validated mincriterion, full variable names, no node ids
plot(
  caret_ctree_B,
  gp = gpar(fontsize = 6),
  inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Opportunity Conditional Inference Tree for Austria 2019 - Cross Validated"
)

# This is the same tree as above, only built with a different package. It looks bad, though.
plot(
  caret_ctree,
  gp = gpar(fontsize = 6),
  inner_panel = node_inner,
  ip_args = list(id = FALSE)
)

# Final model stored inside the caret object, compact layout
plot(caret_ctree[["finalModel"]], type = "simple")

Graphic representation of the tuning parameters

# Resampled RMSE across the tuning values of the conditional inference tree
# (original note: RMSE vs p-value, our resampling parameter)
plot(caret_ctree)

# Resampled RMSE across the cp grid of the regression tree
plot(caret_rpart)

# plotcp(tree_1)

Predictions

# Predict logged income for the held-out rows with the tree fitted on `train`
test$P_AtCt <- predict(Ctree, newdata = as.data.frame(test))

# Squared prediction error per row
test$perror <- (test$P_AtCt - test$inc_net_log)^2

# Root mean squared error. It is a single number; storing it as a column
# repeats it on every row, which is why head() shows the same value six times.
test$RMSE <- sqrt(sum(test$perror / nrow(test), na.rm = TRUE))
head(test$RMSE)
## [1] 0.459181 0.459181 0.459181 0.459181 0.459181 0.459181

# Predicted vs. observed logged income
# TODO: replace with a ggplot version and make it pretty
plot(test$P_AtCt, test$inc_net_log)

# Same evaluation for the caret regression tree.
# NOTE(review): `caret_rpart` was fitted with `data = data19`, which includes
# these test rows, so this may not be an out-of-sample error - confirm.
test$P_AtCt_caret <- predict(caret_rpart, newdata = as.data.frame(test))

# Squared prediction error per row
test$perror_caret <- (test$P_AtCt_caret - test$inc_net_log)^2

# Root mean squared error, repeated on every row of `test`
test$RMSE_caret <- sqrt(sum(test$perror_caret / nrow(test), na.rm = TRUE))
head(test$RMSE_caret)
## [1] 0.4566333 0.4566333 0.4566333 0.4566333 0.4566333 0.4566333

Random Forest

# Conditional inference forest: 500 trees, each grown on an 80% subsample drawn
# without replacement, using the same test settings as the single tree above
cf <- cforest(
  formula,
  data19,
  na.action = na.pass,
  control = ctree_control(
    teststat = "quadratic",
    testtype = "Bonferroni",
    mincriterion = 0.99
  ),
  ytrafo = NULL,
  scores = NULL,
  ntree = 500L,
  perturb = list(replace = FALSE, fraction = 0.8)
)

# Forest predictions for the rows labelled "test"
hat_cf <- predict(cf, newdata = test, OOB = TRUE, type = "response")

# TODO: calculate the RMSE by hand for cforest and boosted ctree to compare

# Variable importance of the forest
varimp(cf, mincriterion = 0, OOB = TRUE)
##                  sex        country_birth           father_cit 
##         0.0677898397         0.0059506319         0.0056707554 
##           father_edu         father_occup           mother_cit 
##         0.0054289985         0.0028044436         0.0065297354 
##           mother_edu    mother_occup_stat         mother_occup 
##         0.0040748970         0.0009257607         0.0004644832 
##         mother_manag              tenancy             children 
##         0.0017892779        -0.0001953003         0.0004404371 
##               adults       adults_working both_parents_present 
##        -0.0006739003         0.0003050525         0.0005388467
# Collect the forest's variable importance in a data frame.
# Note: varimp() is called again here rather than reusing the result printed above.
importance_cf <- data.frame(varimp(cf, mincriterion = 0, OOB = TRUE))
names(importance_cf) <- "importance"
importance_cf$var_name <- rownames(importance_cf)
importance_cf <- importance_cf %>%
  arrange(desc(importance))

# Lollipop-style chart: one point per variable, with a line from 0 to its
# importance; variables are ordered by importance along the (flipped) axis
varimpo <- ggplot(importance_cf, aes(x = var_name, y = importance)) +
  geom_pointrange(
    aes(ymin = 0, ymax = importance),
    shape = 21,
    colour = "black",
    fill = "white",
    size = 3,
    stroke = 1
  ) +
  scale_x_discrete(
    limits = importance_cf$var_name[order(importance_cf$importance)]
  ) +
  labs(
    title = "Conditional Forest variable importance - Austria 2019",
    x = "",
    y = "Mean decrease in sum of squared residuals"
  ) +
  coord_flip() +
  theme_light() +
  theme(
    axis.line = element_blank(),
    panel.border = element_blank(),
    panel.grid.major.y = element_blank()
  )

# Interactive version of the chart
ggplotly(varimpo)

We find that the variable sex is the single most important variable in determining one's income in Austria. However, sex is not a generationally transmittable circumstance and, while it is a circumstance, it is not exactly what we were trying to answer with our exercise. Therefore, we exclude it in the next step and create a new conditional inference forest.

Boosted Inference Tree

#cf_boosted <- blackboost(formula, data = data19, na.action = na.pass, control = boost_control(), tree_controls = partykit::ctree_control())
# cf_boosted

# Model fitted through caret with method "ctree2" and caret's default tuning grid.
# NOTE(review): the section calls this a boosted tree; confirm that "ctree2"
# is a boosting method (the commented-out blackboost() call above suggests
# boosting was the original intention).
cf_boosted_train <- train(
  formula,
  data19,
  method = "ctree2",
  trControl = fitControl,
  tuneGrid = NULL,
  na.action = na.pass
)

# RMSE on the rows labelled "test" (overwrites test$perror and test$RMSE from above)

test$At_BT_CF_pred <- predict(cf_boosted_train, newdata = as.data.frame(test))
test$perror <- (test$At_BT_CF_pred - test$inc_net_log)^2
test$RMSE <- sqrt(sum(test$perror / nrow(test), na.rm = TRUE))
head(test$RMSE)
## [1] 0.4569734 0.4569734 0.4569734 0.4569734 0.4569734 0.4569734

# Predicted vs. observed logged income
# TODO: replace with a ggplot version and make it pretty
plot(test$At_BT_CF_pred, test$inc_net_log)

## Variable Importance

# varimp() has no method for caret `train` objects, so calling it on
# `cf_boosted_train` stopped with "no applicable method for 'varimp'" and every
# following line failed because `importance_cf_boosted` was never created.
# caret's own varImp() accepts a `train` object; its `importance` element is a
# data frame with one row per predictor.
# NOTE(review): for models without a model-specific importance measure, caret
# falls back to a model-free filter measure, so these values are presumably not
# comparable with the cforest importances above - confirm before interpreting.
importance_cf_boosted <- caret::varImp(cf_boosted_train)$importance
names(importance_cf_boosted) <- "importance"
importance_cf_boosted$var_name <- rownames(importance_cf_boosted)
importance_cf_boosted <- importance_cf_boosted %>%
  arrange(desc(importance))
importance_cf_boosted

Cross Country Comparison

**Introduction** In this part of the seminar paper, we attempt to reproduce the findings of [@brunori20], but unfortunately we do not have access to the actual EU-SILC data from 2011. Instead we reproduce the findings using the synthetic data provided by the statistical office of the European Union (Eurostat) (https://ec.europa.eu/eurostat/web/microdata/statistics-on-income-and-living-conditions).

Data Wrangling

The original data is not provided as the EU protects the privacy of the original respondents. The idea of the public microdata, is that it allows us to train and write the code using the actual variable names, but not obtaining true results. The EU-SILC public microdata files are fully synthetic and they were simulated using statistical modeling and show the statistical distributions of the original data. The main caveats of this data are, that it cannot be used for statistical inference to the wider population. The results and conclusion obtained from this public microdata are to be taken with a big grain of salt. Luckily, the individual country datasets are grouped in a coherent manner. We use the EU-SILC data from 2011 as it was the survey when additionally there were questions on inter-generational transmission. These were questions about the parents of the respondents. We want to see, whether it is possible using only circumstantial information given about the parents and respondents to predict the income of the respondents.

The unique identifier used in all four data sets is the household ID identifier: RX030 in the Personal Register, PX030 in the Personal Data, DB030 in the Household Register, and HB030 in the Household Data file. We only need to combine two of the datasets, namely the Household Register and the Personal Data. Latter contains the Ad-hoc module with the questions on intergenerational characteristics.

Following [@brunori20] we use the following variables for circumstances: Respondent’s sex (PB150), Respondent’s country of birth (Citizenship as proxy - PB220A), Presence of parents at home (PT010), Number of adults (18 or older) in respondents household (PT020), Number of working adults (18 or older) in respondents household (PT030), Father/Mother country of birth and citizenship (PT060, PT070, PT090, PT100), Father/mother education (PT110, PT120), Father/mother occupational status (PT130, PT160), Father/mother main occupation (PT150,PT180), Managerial position of father/mother(PT140,PT170), Tenancy status of the house in which respondent was living as a child (PT210).

Outcome Variables i.e. Income: Total Household gross income (HY010), Total Disposable Income (HY020), Dwelling Type (HH010), Housing (HH030).

We first use more variables than ultimately used in the analysis. We use the year of birth to calculate the age, and then exclude everyone older than 60 or younger than 27, as was done in the paper we are referring to. We first included both monthly and annual gross income. But in this cross-country analysis we use annual gross income as our outcome variable.

At first we ran the analysis with the citizenship variable included, but we ultimately decided that it is not really a circumstantial variable in the way the respondent's country of birth would have been, since it is ultimately possible to obtain a new citizenship.

# Folder that holds the EU-SILC 2011 files (relative to the working directory)
data_path <- "./SILC_2011"
getwd()
## [1] "C:/Users/leofi/Desktop/2187-2087_WS2020_Data_Science-Machine_learning/01_Data Science Seminar Paper"
# Austria: read the personal and the household file
AT_personal_data <- read.csv(file.path(data_path, "AT_2011p_EUSILC.csv"))
AT_household_data <- read.csv(file.path(data_path, "AT_2011h_EUSILC.csv"))

# Give the household identifier the same name as in the personal file
AT_household_data <- rename(AT_household_data, "PX030" = HB030)

# Attach the household columns to every person, matching on PX030
AT_equality_data <- left_join(AT_personal_data, AT_household_data, by = "PX030")

# Renaming important variables for readability of tree
AT_equality_data <- AT_equality_data %>% select(
  PB140, HY010, PB150, PB220A, PT010, PT020, PT030, PT060, PT070, PT090, PT100, PT110, PT120, PT130, PT160, PT150, PT180, PT140, PT170, PT210, PY200G) %>% mutate(
    age = (2011 - PB140), log_income = log(HY010 + 1)
  ) %>% filter(
    age %in% (27:59)
  ) %>% mutate(
    citizenship = factor(PB220A, labels = c(1,2,3))
  ) %>% 
  rename(
    "year_of_birth" = PB140,
    "annual_income" = HY010,
    "sex" = PB150,
    "parents_present" = PT010,
    "adults_home" = PT020,
    "children_home" = PT030,
    "father_cob" = PT060,
    "father_cit" = PT070,
    "mother_cob" = PT090,
    "mother_cit" = PT100,
    "father_edu" = PT110,
    "mother_edu" = PT120,
    "father_occup_stat" = PT130,
    "mother_occup_stat" = PT160,
    "father_occup" = PT150,
    "mother_occup" = PT180,
    "father_manag" = PT140,
    "mother_manag" = PT170,
    "tenancy" = PT210,
    "monthly_income" = PY200G)

Summary We provide the summary statistics for Austria, which we obtained using ‘dfSummary’ from the package ‘summarytools’. Compared with the 2019 data set, ‘AT_equality_data’ is larger: it contains almost 7000 observations and has no missing entries in our outcome variable, annual income. However, it does contain many missing values across the observed circumstances. We chose not to exclude those and to deal with these missing entries using the ‘na.action = na.omit’ argument when doing the statistical analysis.

# Render the summarytools overview of the Austrian 2011 data
print(
  dfSummary(AT_equality_data),
  method = "render"
)

Data Frame Summary

AT_equality_data

Dimensions: 6741 x 24
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 year_of_birth [integer] Mean (sd) : 1966.9 (9.2) min < med < max: 1952 < 1966 < 1984 IQR (CV) : 16 (0) 33 distinct values 6741 (100.0%) 0 (0.0%)
2 annual_income [integer] Mean (sd) : 72909.7 (51328.2) min < med < max: 0 < 62595 < 653401 IQR (CV) : 57961 (0.7) 4077 distinct values 6741 (100.0%) 0 (0.0%)
3 sex [integer] Min : 1 Mean : 1.5 Max : 2
1:3320(49.3%)
2:3421(50.7%)
6741 (100.0%) 0 (0.0%)
4 PB220A [character] 1. AT 2. EU 3. Other
5781(85.8%)
352(5.2%)
608(9.0%)
6741 (100.0%) 0 (0.0%)
5 parents_present [integer] Mean (sd) : 1.3 (0.8) min < med < max: 1 < 1 < 5 IQR (CV) : 0 (0.6)
1:3586(86.5%)
2:75(1.8%)
3:385(9.3%)
4:68(1.6%)
5:31(0.7%)
4145 (61.5%) 2596 (38.5%)
6 adults_home [integer] Mean (sd) : 2.7 (1.3) min < med < max: 0 < 2 < 12 IQR (CV) : 1 (0.5) 13 distinct values 4176 (61.9%) 2565 (38.1%)
7 children_home [integer] Mean (sd) : 2.5 (1.6) min < med < max: 1 < 2 < 16 IQR (CV) : 2 (0.6) 14 distinct values 4164 (61.8%) 2577 (38.2%)
8 father_cob [integer] Mean (sd) : 1.3 (0.7) min < med < max: -1 < 1 < 3 IQR (CV) : 0 (0.5)
-1:10(0.2%)
1:3182(78.2%)
2:335(8.2%)
3:542(13.3%)
4069 (60.4%) 2672 (39.6%)
9 father_cit [integer] Mean (sd) : 1.3 (0.7) min < med < max: -1 < 1 < 3 IQR (CV) : 0 (0.5)
-1:8(0.2%)
1:3337(81.1%)
2:224(5.4%)
3:545(13.2%)
4114 (61.0%) 2627 (39.0%)
10 mother_cob [integer] Mean (sd) : 1.4 (0.7) min < med < max: -1 < 1 < 3 IQR (CV) : 0 (0.5)
-1:1(0.0%)
1:3212(77.0%)
2:395(9.5%)
3:564(13.5%)
4172 (61.9%) 2569 (38.1%)
11 mother_cit [integer] Mean (sd) : 1.3 (0.7) min < med < max: -1 < 1 < 3 IQR (CV) : 0 (0.5)
-1:5(0.1%)
1:3323(80.0%)
2:297(7.1%)
3:529(12.7%)
4154 (61.6%) 2587 (38.4%)
12 father_edu [integer] Mean (sd) : 1.7 (0.8) min < med < max: -1 < 2 < 3 IQR (CV) : 1 (0.5)
-1:129(3.1%)
0:24(0.6%)
1:1535(36.8%)
2:1856(44.5%)
3:623(15.0%)
4167 (61.8%) 2574 (38.2%)
13 mother_edu [integer] Mean (sd) : 1.4 (0.7) min < med < max: -1 < 1 < 3 IQR (CV) : 1 (0.5)
-1:60(1.4%)
0:89(2.2%)
1:2271(54.9%)
2:1532(37.0%)
3:187(4.5%)
4139 (61.4%) 2602 (38.6%)
14 father_occup_stat [integer] Mean (sd) : 1.3 (0.7) min < med < max: -1 < 1 < 6 IQR (CV) : 0 (0.5)
-1:31(0.8%)
1:2943(74.8%)
2:869(22.1%)
3:16(0.4%)
4:51(1.3%)
5:3(0.1%)
6:19(0.5%)
3932 (58.3%) 2809 (41.7%)
15 mother_occup_stat [integer] Mean (sd) : 3 (1.9) min < med < max: -1 < 2 < 6 IQR (CV) : 4 (0.6)
-1:18(0.4%)
1:1567(38.3%)
2:665(16.3%)
3:6(0.1%)
4:16(0.4%)
5:1795(43.9%)
6:25(0.6%)
4092 (60.7%) 2649 (39.3%)
16 father_occup [integer] Mean (sd) : 5.6 (2.3) min < med < max: -1 < 6 < 9 IQR (CV) : 3 (0.4) 11 distinct values 3813 (56.6%) 2928 (43.4%)
17 mother_occup [integer] Mean (sd) : 5.5 (2.1) min < med < max: -1 < 5 < 9 IQR (CV) : 1 (0.4) 11 distinct values 2272 (33.7%) 4469 (66.3%)
18 father_manag [integer] Mean (sd) : 1.6 (0.6) min < med < max: -1 < 2 < 2 IQR (CV) : 1 (0.4)
-1:72(1.9%)
1:1412(36.5%)
2:2383(61.6%)
3867 (57.4%) 2874 (42.6%)
19 mother_manag [integer] Mean (sd) : 1.8 (0.5) min < med < max: -1 < 2 < 2 IQR (CV) : 0 (0.3)
-1:25(1.1%)
1:391(17.6%)
2:1810(81.3%)
2226 (33.0%) 4515 (67.0%)
20 tenancy [integer] Mean (sd) : 1.5 (0.7) min < med < max: -1 < 1 < 3 IQR (CV) : 1 (0.5)
-1:7(0.2%)
1:2396(58.1%)
2:1205(29.2%)
3:513(12.4%)
4121 (61.1%) 2620 (38.9%)
21 monthly_income [integer] Mean (sd) : 624.1 (280.8) min < med < max: 1 < 851 < 851 IQR (CV) : 495 (0.5) 651 distinct values 6741 (100.0%) 0 (0.0%)
22 age [numeric] Mean (sd) : 44.1 (9.2) min < med < max: 27 < 45 < 59 IQR (CV) : 16 (0.2) 33 distinct values 6741 (100.0%) 0 (0.0%)
23 log_income [numeric] Mean (sd) : 10.9 (1.1) min < med < max: 0 < 11 < 13.4 IQR (CV) : 0.9 (0.1) 4077 distinct values 6741 (100.0%) 0 (0.0%)
24 citizenship [factor] 1. 1 2. 2 3. 3
5781(85.8%)
352(5.2%)
608(9.0%)
6741 (100.0%) 0 (0.0%)

Generated by summarytools 0.9.8 (R version 4.0.3)
2021-02-16

# The data wrangling steps are repeated below for other EU Member States
# France: read both files, align the identifier name, join household onto person
FR_personal_data <- read.csv(file.path(data_path, "FR_2011p_EUSILC.csv"))
FR_household_data <- read.csv(file.path(data_path, "FR_2011h_EUSILC.csv"))
FR_household_data <- rename(FR_household_data, "PX030" = HB030)
FR_equality_data <- left_join(FR_personal_data, FR_household_data, by = "PX030")

FR_equality_data <- FR_equality_data %>% select(
  PB140, HY010, PB150, PB220A, PT010, PT020, PT030, PT060, PT070, PT090, PT100, PT110, PT120, PT130, PT160, PT150, PT180, PT140, PT170, PT210, PY200G) %>% mutate(
    age = (2011 - PB140), log_income = log(HY010 + 1)
  ) %>% filter(
    age %in% (27:59)
  ) %>% mutate(
    citizenship = factor(PB220A, labels = c(1,2,3))
  ) %>% 
  rename(
    "year_of_birth" = PB140,
    "annual_income" = HY010,
    "sex" = PB150,
    "parents_present" = PT010,
    "adults_home" = PT020,
    "children_home" = PT030,
    "father_cob" = PT060,
    "father_cit" = PT070,
    "mother_cob" = PT090,
    "mother_cit" = PT100,
    "father_edu" = PT110,
    "mother_edu" = PT120,
    "father_occup_stat" = PT130,
    "mother_occup_stat" = PT160,
    "father_occup" = PT150,
    "mother_occup" = PT180,
    "father_manag" = PT140,
    "mother_manag" = PT170,
    "tenancy" = PT210,
    "monthly_income" = PY200G)
# Denmark
# Read the personal (p) and household (h) files and attach the household
# variables to every person record.
DK_personal_data <- read.csv(file.path(data_path, "DK_2011p_EUSILC.csv"))
DK_household_data <- read.csv(file.path(data_path, "DK_2011h_EUSILC.csv"))
# The household file names the join key HB030; rename it so that it matches
# PX030 in the personal file.
DK_household_data <- DK_household_data %>% rename("PX030" = HB030)
DK_equality_data <- DK_personal_data %>%
  left_join(DK_household_data, by = "PX030")

DK_equality_data <- DK_equality_data %>%
  select(
    PB140, HY010, PB150, PB220A, PT010, PT020, PT030, PT060, PT070, PT090,
    PT100, PT110, PT120, PT130, PT160, PT150, PT180, PT140, PT170, PT210,
    PY200G
  ) %>%
  mutate(
    age = 2011 - PB140,
    # +1 so that a zero income maps to log(1) = 0. log() returns NaN (with a
    # warning) for rows where HY010 + 1 is negative; HY010 does take negative
    # values here, so those rows end up with a missing log_income.
    log_income = log(HY010 + 1)
  ) %>%
  filter(age %in% 27:59) %>%
  mutate(
    # Pin the level order explicitly: 1 = national, 2 = EU, 3 = Other.
    # Relying on factor()'s alphabetical default only gives this order when
    # the country code happens to sort before "EU".
    citizenship = factor(
      PB220A,
      levels = c("DK", "EU", "Other"),
      labels = c(1, 2, 3)
    )
  ) %>%
  rename(
    "year_of_birth" = PB140,
    "annual_income" = HY010,
    "sex" = PB150,
    "parents_present" = PT010,
    "adults_home" = PT020,
    "children_home" = PT030,
    "father_cob" = PT060,
    "father_cit" = PT070,
    "mother_cob" = PT090,
    "mother_cit" = PT100,
    "father_edu" = PT110,
    "mother_edu" = PT120,
    "father_occup_stat" = PT130,
    "mother_occup_stat" = PT160,
    "father_occup" = PT150,
    "mother_occup" = PT180,
    "father_manag" = PT140,
    "mother_manag" = PT170,
    "tenancy" = PT210,
    # NOTE(review): PY200G is constant (1) for every row in the rendered
    # summary of this data set - confirm it carries usable information.
    "monthly_income" = PY200G
  )
# Spain
# Read the personal (p) and household (h) files and attach the household
# variables to every person record.
ES_personal_data <- read.csv(file.path(data_path, "ES_2011p_EUSILC.csv"))
ES_household_data <- read.csv(file.path(data_path, "ES_2011h_EUSILC.csv"))
# The household file names the join key HB030; rename it so that it matches
# PX030 in the personal file.
ES_household_data <- ES_household_data %>% rename("PX030" = HB030)
ES_equality_data <- ES_personal_data %>%
  left_join(ES_household_data, by = "PX030")

ES_equality_data <- ES_equality_data %>%
  select(
    PB140, HY010, PB150, PB220A, PT010, PT020, PT030, PT060, PT070, PT090,
    PT100, PT110, PT120, PT130, PT160, PT150, PT180, PT140, PT170, PT210,
    PY200G
  ) %>%
  mutate(
    age = 2011 - PB140,
    # +1 so that a zero income maps to log(1) = 0. log() returns NaN (with a
    # warning) for rows where HY010 + 1 is negative; HY010 does take negative
    # values here, so those rows end up with a missing log_income.
    log_income = log(HY010 + 1)
  ) %>%
  filter(age %in% 27:59) %>%
  mutate(
    # Pin the level order explicitly: 1 = national, 2 = EU, 3 = Other.
    # Relying on factor()'s alphabetical default only gives this order when
    # the country code happens to sort before "EU".
    citizenship = factor(
      PB220A,
      levels = c("ES", "EU", "Other"),
      labels = c(1, 2, 3)
    )
  ) %>%
  rename(
    "year_of_birth" = PB140,
    "annual_income" = HY010,
    "sex" = PB150,
    "parents_present" = PT010,
    "adults_home" = PT020,
    "children_home" = PT030,
    "father_cob" = PT060,
    "father_cit" = PT070,
    "mother_cob" = PT090,
    "mother_cit" = PT100,
    "father_edu" = PT110,
    "mother_edu" = PT120,
    "father_occup_stat" = PT130,
    "mother_occup_stat" = PT160,
    "father_occup" = PT150,
    "mother_occup" = PT180,
    "father_manag" = PT140,
    "mother_manag" = PT170,
    "tenancy" = PT210,
    "monthly_income" = PY200G
  )
# Finland
# Read the personal (p) and household (h) files and attach the household
# variables to every person record.
FI_personal_data <- read.csv(file.path(data_path, "FI_2011p_EUSILC.csv"))
FI_household_data <- read.csv(file.path(data_path, "FI_2011h_EUSILC.csv"))
# The household file names the join key HB030; rename it so that it matches
# PX030 in the personal file.
FI_household_data <- FI_household_data %>% rename("PX030" = HB030)
FI_equality_data <- FI_personal_data %>%
  left_join(FI_household_data, by = "PX030")

FI_equality_data <- FI_equality_data %>%
  select(
    PB140, HY010, PB150, PB220A, PT010, PT020, PT030, PT060, PT070, PT090,
    PT100, PT110, PT120, PT130, PT160, PT150, PT180, PT140, PT170, PT210,
    PY200G
  ) %>%
  mutate(
    age = 2011 - PB140,
    # +1 so that a zero income maps to log(1) = 0. log() returns NaN (with a
    # warning) for rows where HY010 + 1 is negative; HY010 does take negative
    # values here, so those rows end up with a missing log_income.
    log_income = log(HY010 + 1)
  ) %>%
  filter(age %in% 27:59) %>%
  mutate(
    # Pin the level order explicitly: 1 = national, 2 = EU, 3 = Other.
    # Without `levels`, factor() sorts the PB220A strings alphabetically
    # ("EU" < "FI"), which would make 1 = EU and 2 = national for Finland.
    citizenship = factor(
      PB220A,
      levels = c("FI", "EU", "Other"),
      labels = c(1, 2, 3)
    )
  ) %>%
  rename(
    "year_of_birth" = PB140,
    "annual_income" = HY010,
    "sex" = PB150,
    "parents_present" = PT010,
    "adults_home" = PT020,
    "children_home" = PT030,
    "father_cob" = PT060,
    "father_cit" = PT070,
    "mother_cob" = PT090,
    "mother_cit" = PT100,
    "father_edu" = PT110,
    "mother_edu" = PT120,
    "father_occup_stat" = PT130,
    "mother_occup_stat" = PT160,
    "father_occup" = PT150,
    "mother_occup" = PT180,
    "father_manag" = PT140,
    "mother_manag" = PT170,
    "tenancy" = PT210,
    # NOTE(review): PY200G is constant (1) for every row in the rendered
    # summary of this data set - confirm it carries usable information.
    "monthly_income" = PY200G
  )
# Italy
# Read the personal (p) and household (h) files and attach the household
# variables to every person record.
IT_personal_data <- read.csv(file.path(data_path, "IT_2011p_EUSILC.csv"))
IT_household_data <- read.csv(file.path(data_path, "IT_2011h_EUSILC.csv"))
# The household file names the join key HB030; rename it so that it matches
# PX030 in the personal file.
IT_household_data <- IT_household_data %>% rename("PX030" = HB030)
IT_equality_data <- IT_personal_data %>%
  left_join(IT_household_data, by = "PX030")

# NOTE(review): in the rendered summary, HY010 for Italy has a mean of about
# 34 million and only 113 distinct values over 38223 rows, unlike every other
# country. Verify the household join and the source file before relying on
# annual_income or log_income for Italy.
IT_equality_data <- IT_equality_data %>%
  select(
    PB140, HY010, PB150, PB220A, PT010, PT020, PT030, PT060, PT070, PT090,
    PT100, PT110, PT120, PT130, PT160, PT150, PT180, PT140, PT170, PT210,
    PY200G
  ) %>%
  mutate(
    age = 2011 - PB140,
    # +1 so that a zero income maps to log(1) = 0. log() returns NaN (with a
    # warning) for rows where HY010 + 1 is negative.
    log_income = log(HY010 + 1)
  ) %>%
  filter(age %in% 27:59) %>%
  mutate(
    # Pin the level order explicitly: 1 = national, 2 = EU, 3 = Other.
    # Without `levels`, factor() sorts the PB220A strings alphabetically
    # ("EU" < "IT"), which would make 1 = EU and 2 = national for Italy.
    citizenship = factor(
      PB220A,
      levels = c("IT", "EU", "Other"),
      labels = c(1, 2, 3)
    )
  ) %>%
  rename(
    "year_of_birth" = PB140,
    "annual_income" = HY010,
    "sex" = PB150,
    "parents_present" = PT010,
    "adults_home" = PT020,
    "children_home" = PT030,
    "father_cob" = PT060,
    "father_cit" = PT070,
    "mother_cob" = PT090,
    "mother_cit" = PT100,
    "father_edu" = PT110,
    "mother_edu" = PT120,
    "father_occup_stat" = PT130,
    "mother_occup_stat" = PT160,
    "father_occup" = PT150,
    "mother_occup" = PT180,
    "father_manag" = PT140,
    "mother_manag" = PT170,
    "tenancy" = PT210,
    "monthly_income" = PY200G
  )
# # Bulgaria
# NOTE(review): this whole section is commented out, so no Bulgarian data set
# is built. If it is re-enabled, be aware that
# `factor(PB220A, labels = c(1,2,3))` numbers the categories in alphabetical
# order of the PB220A values, so which citizenship group receives code 1
# depends on the country code; pass explicit `levels` to pin the coding.
# BG_personal_data <- read.csv(file.path(data_path, "BG_2011p_EUSILC.csv"))
# BG_household_data <- read.csv(file.path(data_path, "BG_2011h_EUSILC.csv"))
# BG_household_data <- BG_household_data %>% rename("PX030" = HB030)
# BG_equality_data <- BG_personal_data %>%  left_join(BG_household_data, by = "PX030")
# 
# BG_equality_data <- BG_equality_data %>% select(
#   PB140, HY010, PB150, PB220A, PT010, PT020, PT030, PT060, PT070, PT090, PT100, PT110, PT120, PT130, PT160, PT150, PT180, PT140, PT170, PT210, PY200G) %>% mutate(
#     age = (2011 - PB140), log_income = log(HY010 + 1)
#   ) %>% filter(
#     age %in% (27:59)
#   ) %>% mutate(
#     citizenship = factor(PB220A, labels = c(1,2,3))
#   ) %>% 
#   rename(
#     "year_of_birth" = PB140,
#     "annual_income" = HY010,
#     "sex" = PB150,
#     "parents_present" = PT010,
#     "adults_home" = PT020,
#     "children_home" = PT030,
#     "father_cob" = PT060,
#     "father_cit" = PT070,
#     "mother_cob" = PT090,
#     "mother_cit" = PT100,
#     "father_edu" = PT110,
#     "mother_edu" = PT120,
#     "father_occup_stat" = PT130,
#     "mother_occup_stat" = PT160,
#     "father_occup" = PT150,
#     "mother_occup" = PT180,
#     "father_manag" = PT140,
#     "mother_manag" = PT170,
#     "tenancy" = PT210,
#     "monthly_income" = PY200G)
# Latvia
# Read the personal (p) and household (h) files and attach the household
# variables to every person record.
LV_personal_data <- read.csv(file.path(data_path, "LV_2011p_EUSILC.csv"))
LV_household_data <- read.csv(file.path(data_path, "LV_2011h_EUSILC.csv"))
# The household file names the join key HB030; rename it so that it matches
# PX030 in the personal file.
LV_household_data <- LV_household_data %>% rename("PX030" = HB030)
LV_equality_data <- LV_personal_data %>%
  left_join(LV_household_data, by = "PX030")

LV_equality_data <- LV_equality_data %>%
  select(
    PB140, HY010, PB150, PB220A, PT010, PT020, PT030, PT060, PT070, PT090,
    PT100, PT110, PT120, PT130, PT160, PT150, PT180, PT140, PT170, PT210,
    PY200G
  ) %>%
  mutate(
    age = 2011 - PB140,
    # +1 so that a zero income maps to log(1) = 0. log() returns NaN (with a
    # warning) for rows where HY010 + 1 is negative.
    log_income = log(HY010 + 1)
  ) %>%
  filter(age %in% 27:59) %>%
  mutate(
    # Pin the level order explicitly: 1 = national, 2 = EU, 3 = Other.
    # Without `levels`, factor() sorts the PB220A strings alphabetically
    # ("EU" < "LV"), which would make 1 = EU and 2 = national for Latvia.
    citizenship = factor(
      PB220A,
      levels = c("LV", "EU", "Other"),
      labels = c(1, 2, 3)
    )
  ) %>%
  rename(
    "year_of_birth" = PB140,
    "annual_income" = HY010,
    "sex" = PB150,
    "parents_present" = PT010,
    "adults_home" = PT020,
    "children_home" = PT030,
    "father_cob" = PT060,
    "father_cit" = PT070,
    "mother_cob" = PT090,
    "mother_cit" = PT100,
    "father_edu" = PT110,
    "mother_edu" = PT120,
    "father_occup_stat" = PT130,
    "mother_occup_stat" = PT160,
    "father_occup" = PT150,
    "mother_occup" = PT180,
    "father_manag" = PT140,
    "mother_manag" = PT170,
    "tenancy" = PT210,
    # NOTE(review): PY200G is constant (1) for every row in the rendered
    # summary of this data set - confirm it carries usable information.
    "monthly_income" = PY200G
  )
# Render the summarytools data-frame summary of the French sample
# (per-variable statistics, value frequencies and valid/missing counts).
print(dfSummary(FR_equality_data), method="render")

Data Frame Summary

FR_equality_data

Dimensions: 11013 x 24
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 year_of_birth [integer] Mean (sd) : 1967.9 (9.4) min < med < max: 1952 < 1968 < 1984 IQR (CV) : 16 (0) 33 distinct values 11013 (100.0%) 0 (0.0%)
2 annual_income [integer] Mean (sd) : 59673.4 (48605.8) min < med < max: 0 < 51111 < 1284688 IQR (CV) : 40114 (0.8) 6743 distinct values 11013 (100.0%) 0 (0.0%)
3 sex [integer] Min : 1 Mean : 1.5 Max : 2
1:5372(48.8%)
2:5641(51.2%)
11013 (100.0%) 0 (0.0%)
4 PB220A [character] 1. EU 2. FR 3. Other
223(2.0%)
10328(93.8%)
462(4.2%)
11013 (100.0%) 0 (0.0%)
5 parents_present [integer] Mean (sd) : 1.4 (0.9) min < med < max: 1 < 1 < 5 IQR (CV) : 0 (0.6)
1:5531(82.8%)
2:124(1.9%)
3:825(12.4%)
4:121(1.8%)
5:78(1.2%)
6679 (60.6%) 4334 (39.4%)
6 adults_home [integer] Mean (sd) : 2.4 (1.2) min < med < max: 0 < 2 < 20 IQR (CV) : 1 (0.5) 15 distinct values 6560 (59.6%) 4453 (40.4%)
7 children_home [integer] Mean (sd) : 1.6 (1.5) min < med < max: 0 < 1 < 13 IQR (CV) : 1 (1) 13 distinct values 6526 (59.3%) 4487 (40.7%)
8 father_cob [integer] Mean (sd) : 1.4 (1) min < med < max: -1 < 1 < 4 IQR (CV) : 0 (0.7)
-1:5(0.1%)
1:5209(80.7%)
2:491(7.6%)
3:33(0.5%)
4:714(11.1%)
6452 (58.6%) 4561 (41.4%)
9 father_cit [integer] Mean (sd) : 1.3 (0.8) min < med < max: -1 < 1 < 4 IQR (CV) : 0 (0.6)
-1:27(0.4%)
1:5682(87.0%)
2:373(5.7%)
3:22(0.3%)
4:426(6.5%)
6530 (59.3%) 4483 (40.7%)
10 mother_cob [integer] Mean (sd) : 1.4 (0.9) min < med < max: 1 < 1 < 4 IQR (CV) : 0 (0.7)
1:5417(82.5%)
2:462(7.0%)
3:30(0.5%)
4:660(10.0%)
6569 (59.6%) 4444 (40.4%)
11 mother_cit [integer] Mean (sd) : 1.2 (0.8) min < med < max: -1 < 1 < 4 IQR (CV) : 0 (0.6)
-1:23(0.3%)
1:5941(88.4%)
2:306(4.6%)
3:24(0.4%)
4:428(6.4%)
6722 (61.0%) 4291 (39.0%)
12 father_edu [integer] Mean (sd) : 1.1 (1) min < med < max: -1 < 1 < 3 IQR (CV) : 0 (0.9)
-1:554(8.4%)
0:242(3.7%)
1:4480(68.1%)
2:554(8.4%)
3:753(11.4%)
6583 (59.8%) 4430 (40.2%)
13 mother_edu [integer] Mean (sd) : 1.1 (0.8) min < med < max: -1 < 1 < 3 IQR (CV) : 0 (0.8)
-1:365(5.5%)
0:339(5.1%)
1:4750(72.0%)
2:611(9.3%)
3:535(8.1%)
6600 (59.9%) 4413 (40.1%)
14 father_occup_stat [integer] Mean (sd) : 1.2 (0.8) min < med < max: -1 < 1 < 6 IQR (CV) : 0 (0.6)
-1:134(2.1%)
1:4824(77.1%)
2:1137(18.2%)
3:42(0.7%)
4:37(0.6%)
5:5(0.1%)
6:79(1.3%)
6258 (56.8%) 4755 (43.2%)
15 mother_occup_stat [integer] Mean (sd) : 2.8 (2) min < med < max: -1 < 2 < 6 IQR (CV) : 4 (0.7)
-1:93(1.4%)
1:3134(48.3%)
2:526(8.1%)
3:13(0.2%)
4:10(0.2%)
5:2612(40.2%)
6:107(1.6%)
6495 (59.0%) 4518 (41.0%)
16 father_occup [integer] Mean (sd) : 5.3 (3) min < med < max: -1 < 6 < 9 IQR (CV) : 5 (0.6) 11 distinct values 6061 (55.0%) 4952 (45.0%)
17 mother_occup [integer] Mean (sd) : 5.1 (2.5) min < med < max: -1 < 5 < 9 IQR (CV) : 3 (0.5) 11 distinct values 3795 (34.5%) 7218 (65.5%)
18 father_manag [integer] Mean (sd) : 1.5 (0.8) min < med < max: -1 < 2 < 2 IQR (CV) : 1 (0.5)
-1:356(5.8%)
1:2223(36.0%)
2:3591(58.2%)
6170 (56.0%) 4843 (44.0%)
19 mother_manag [integer] Mean (sd) : 1.8 (0.6) min < med < max: -1 < 2 < 2 IQR (CV) : 0 (0.4)
-1:142(3.8%)
1:494(13.1%)
2:3121(83.1%)
3757 (34.1%) 7256 (65.9%)
20 tenancy [integer] Mean (sd) : 1.4 (0.6) min < med < max: -1 < 1 < 3 IQR (CV) : 1 (0.5)
-1:100(1.5%)
1:4230(64.0%)
2:2009(30.4%)
3:269(4.1%)
6608 (60.0%) 4405 (40.0%)
21 monthly_income [integer] 1 distinct value
1:11013(100.0%)
11013 (100.0%) 0 (0.0%)
22 age [numeric] Mean (sd) : 43.1 (9.4) min < med < max: 27 < 43 < 59 IQR (CV) : 16 (0.2) 33 distinct values 11013 (100.0%) 0 (0.0%)
23 log_income [numeric] Mean (sd) : 10.7 (0.9) min < med < max: 0 < 10.8 < 14.1 IQR (CV) : 0.8 (0.1) 6743 distinct values 11013 (100.0%) 0 (0.0%)
24 citizenship [factor] 1. 1 2. 2 3. 3
223(2.0%)
10328(93.8%)
462(4.2%)
11013 (100.0%) 0 (0.0%)

Generated by summarytools 0.9.8 (R version 4.0.3)
2021-02-16

# Render the summarytools data-frame summary of the Danish sample.
# NOTE: the parental-background variables are missing for roughly 60-75% of
# the rows (see the Valid / Missing columns), which may be too many missing
# values for Denmark to be usable in the analysis.
print(dfSummary(DK_equality_data), method="render")

Data Frame Summary

DK_equality_data

Dimensions: 4536 x 24
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 year_of_birth [integer] Mean (sd) : 1967.5 (9.2) min < med < max: 1952 < 1967 < 1984 IQR (CV) : 15 (0) 33 distinct values 4536 (100.0%) 0 (0.0%)
2 annual_income [integer] Mean (sd) : 82526 (48369.5) min < med < max: -112975 < 77951 < 532044 IQR (CV) : 55764.5 (0.6) 2983 distinct values 4536 (100.0%) 0 (0.0%)
3 sex [integer] Min : 1 Mean : 1.5 Max : 2
1:2301(50.7%)
2:2235(49.3%)
4536 (100.0%) 0 (0.0%)
4 PB220A [character] 1. DK 2. EU 3. Other
4318(95.2%)
100(2.2%)
118(2.6%)
4536 (100.0%) 0 (0.0%)
5 parents_present [integer] Mean (sd) : 1.4 (0.8) min < med < max: 1 < 1 < 5 IQR (CV) : 0 (0.6)
1:1478(82.8%)
2:43(2.4%)
3:228(12.8%)
4:15(0.8%)
5:22(1.2%)
1786 (39.4%) 2750 (60.6%)
6 adults_home [integer] Mean (sd) : 2.2 (0.8) min < med < max: 1 < 2 < 8 IQR (CV) : 0 (0.3)
1:176(9.5%)
2:1289(69.5%)
3:287(15.5%)
4:77(4.2%)
5:12(0.6%)
6:7(0.4%)
7:4(0.2%)
8:2(0.1%)
1854 (40.9%) 2682 (59.1%)
7 children_home [integer] Mean (sd) : 2.1 (1.1) min < med < max: 1 < 2 < 9 IQR (CV) : 2 (0.5)
1:587(32.0%)
2:714(39.0%)
3:355(19.4%)
4:112(6.1%)
5:31(1.7%)
6:22(1.2%)
7:6(0.3%)
8:5(0.3%)
9:1(0.1%)
1833 (40.4%) 2703 (59.6%)
8 father_cob [integer] Mean (sd) : 1.1 (0.5) min < med < max: 1 < 1 < 4 IQR (CV) : 0 (0.4)
1:1719(94.8%)
2:47(2.6%)
3:11(0.6%)
4:37(2.0%)
1814 (40.0%) 2722 (60.0%)
9 father_cit [integer] Mean (sd) : 1 (0.2) min < med < max: 1 < 1 < 3 IQR (CV) : 0 (0.2)
1:1715(97.3%)
2:31(1.8%)
3:16(0.9%)
1762 (38.8%) 2774 (61.2%)
10 mother_cob [integer] Mean (sd) : 1.1 (0.6) min < med < max: 1 < 1 < 4 IQR (CV) : 0 (0.5)
1:1708(92.5%)
2:62(3.4%)
3:21(1.1%)
4:56(3.0%)
1847 (40.7%) 2689 (59.3%)
11 mother_cit [integer] Mean (sd) : 1.1 (0.5) min < med < max: 1 < 1 < 4 IQR (CV) : 0 (0.5)
1:1763(94.6%)
2:35(1.9%)
3:14(0.8%)
4:51(2.7%)
1863 (41.1%) 2673 (58.9%)
12 father_edu [integer] Mean (sd) : 1.9 (0.8) min < med < max: 1 < 2 < 3 IQR (CV) : 2 (0.4)
1:522(33.4%)
2:650(41.5%)
3:393(25.1%)
1565 (34.5%) 2971 (65.5%)
13 mother_edu [integer] Mean (sd) : 1.7 (0.8) min < med < max: 1 < 2 < 3 IQR (CV) : 1 (0.5)
1:836(47.5%)
2:551(31.3%)
3:373(21.2%)
1760 (38.8%) 2776 (61.2%)
14 father_occup_stat [integer] Mean (sd) : 1.3 (0.6) min < med < max: 1 < 1 < 6 IQR (CV) : 1 (0.4)
1:1125(72.1%)
2:403(25.8%)
3:9(0.6%)
4:20(1.3%)
5:1(0.1%)
6:2(0.1%)
1560 (34.4%) 2976 (65.6%)
15 mother_occup_stat [integer] Mean (sd) : 2.2 (1.8) min < med < max: 1 < 1 < 6 IQR (CV) : 4 (0.8)
1:1105(64.3%)
2:117(6.8%)
3:9(0.5%)
4:29(1.7%)
5:445(25.9%)
6:14(0.8%)
1719 (37.9%) 2817 (62.1%)
16 father_occup [integer] Mean (sd) : 5 (2.4) min < med < max: 1 < 6 < 9 IQR (CV) : 5 (0.5)
1:183(12.3%)
2:190(12.8%)
3:123(8.3%)
4:51(3.4%)
5:125(8.4%)
6:238(16.0%)
7:448(30.1%)
8:110(7.4%)
9:19(1.3%)
1487 (32.8%) 3049 (67.2%)
17 mother_occup [integer] Mean (sd) : 4.1 (1.6) min < med < max: 1 < 4 < 9 IQR (CV) : 2 (0.4)
1:34(2.9%)
2:201(17.4%)
3:172(14.9%)
4:219(18.9%)
5:366(31.7%)
6:56(4.8%)
7:75(6.5%)
8:32(2.8%)
9:1(0.1%)
1156 (25.5%) 3380 (74.5%)
18 father_manag [integer] Min : 1 Mean : 1.5 Max : 2
1:697(46.1%)
2:814(53.9%)
1511 (33.3%) 3025 (66.7%)
19 mother_manag [integer] Min : 1 Mean : 1.8 Max : 2
1:228(18.8%)
2:987(81.2%)
1215 (26.8%) 3321 (73.2%)
20 tenancy [integer] Mean (sd) : 1.3 (0.5) min < med < max: 1 < 1 < 3 IQR (CV) : 0 (0.4)
1:1359(75.3%)
2:433(24.0%)
3:13(0.7%)
1805 (39.8%) 2731 (60.2%)
21 monthly_income [integer] 1 distinct value
1:4536(100.0%)
4536 (100.0%) 0 (0.0%)
22 age [numeric] Mean (sd) : 43.5 (9.2) min < med < max: 27 < 44 < 59 IQR (CV) : 15 (0.2) 33 distinct values 4536 (100.0%) 0 (0.0%)
23 log_income [numeric] Mean (sd) : 11.2 (0.7) min < med < max: 3.5 < 11.3 < 13.2 IQR (CV) : 0.7 (0.1) 2956 distinct values 4502 (99.3%) 34 (0.7%)
24 citizenship [factor] 1. 1 2. 2 3. 3
4318(95.2%)
100(2.2%)
118(2.6%)
4536 (100.0%) 0 (0.0%)

Generated by summarytools 0.9.8 (R version 4.0.3)
2021-02-16

# Render the summarytools data-frame summary of the Spanish sample
# (per-variable statistics, value frequencies and valid/missing counts).
print(dfSummary(ES_equality_data), method="render")

Data Frame Summary

ES_equality_data

Dimensions: 17160 x 24
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 year_of_birth [integer] Mean (sd) : 1968.4 (9) min < med < max: 1952 < 1969 < 1984 IQR (CV) : 15 (0) 33 distinct values 17160 (100.0%) 0 (0.0%)
2 annual_income [integer] Mean (sd) : 42864.5 (32836.6) min < med < max: -61984 < 36351 < 533691 IQR (CV) : 35266.2 (0.8) 9103 distinct values 17160 (100.0%) 0 (0.0%)
3 sex [integer] Min : 1 Mean : 1.5 Max : 2
1:8701(50.7%)
2:8459(49.3%)
17160 (100.0%) 0 (0.0%)
4 PB220A [character] 1. ES 2. EU 3. Other
14628(85.2%)
777(4.5%)
1755(10.2%)
17160 (100.0%) 0 (0.0%)
5 parents_present [integer] Mean (sd) : 1.2 (0.8) min < med < max: 1 < 1 < 5 IQR (CV) : 0 (0.6)
1:10052(89.3%)
2:145(1.3%)
3:617(5.5%)
4:369(3.3%)
5:79(0.7%)
11262 (65.6%) 5898 (34.4%)
6 adults_home [integer] Mean (sd) : 2.8 (1.3) min < med < max: 0 < 2 < 14 IQR (CV) : 2 (0.5) 15 distinct values 11170 (65.1%) 5990 (34.9%)
7 children_home [integer] Mean (sd) : 2.4 (1.4) min < med < max: 1 < 2 < 12 IQR (CV) : 2 (0.6) 12 distinct values 11023 (64.2%) 6137 (35.8%)
8 father_cob [integer] Mean (sd) : 1.4 (0.9) min < med < max: -1 < 1 < 4 IQR (CV) : 0 (0.7)
-1:73(0.6%)
1:9477(83.8%)
2:534(4.7%)
3:65(0.6%)
4:1158(10.2%)
11307 (65.9%) 5853 (34.1%)
9 father_cit [integer] Mean (sd) : 1.3 (0.9) min < med < max: -1 < 1 < 4 IQR (CV) : 0 (0.7)
-1:16(0.1%)
1:9534(85.6%)
2:493(4.4%)
3:52(0.5%)
4:1040(9.3%)
11135 (64.9%) 6025 (35.1%)
10 mother_cob [integer] Mean (sd) : 1.3 (0.9) min < med < max: -1 < 1 < 4 IQR (CV) : 0 (0.7)
-1:57(0.5%)
1:9461(85.0%)
2:452(4.1%)
3:56(0.5%)
4:1107(9.9%)
11133 (64.9%) 6027 (35.1%)
11 mother_cit [integer] Mean (sd) : 1.3 (0.9) min < med < max: -1 < 1 < 4 IQR (CV) : 0 (0.7)
-1:35(0.3%)
1:9628(85.7%)
2:505(4.5%)
3:48(0.4%)
4:1023(9.1%)
11239 (65.5%) 5921 (34.5%)
12 father_edu [integer] Mean (sd) : 1.2 (0.8) min < med < max: -1 < 1 < 3 IQR (CV) : 0 (0.7)
-1:375(3.3%)
0:504(4.5%)
1:8447(75.1%)
2:848(7.5%)
3:1073(9.5%)
11247 (65.5%) 5913 (34.5%)
13 mother_edu [integer] Mean (sd) : 1 (0.6) min < med < max: -1 < 1 < 3 IQR (CV) : 0 (0.6)
-1:277(2.5%)
0:769(6.8%)
1:9027(80.0%)
2:681(6.0%)
3:534(4.7%)
11288 (65.8%) 5872 (34.2%)
14 father_occup_stat [integer] Mean (sd) : 1.3 (0.7) min < med < max: -1 < 1 < 6 IQR (CV) : 0 (0.5)
-1:60(0.6%)
1:7924(74.5%)
2:2359(22.2%)
3:42(0.4%)
4:173(1.6%)
5:13(0.1%)
6:65(0.6%)
10636 (62.0%) 6524 (38.0%)
15 mother_occup_stat [integer] Mean (sd) : 3.9 (1.7) min < med < max: -1 < 5 < 6 IQR (CV) : 3 (0.4)
-1:54(0.5%)
1:2263(20.7%)
2:764(7.0%)
3:21(0.2%)
4:27(0.2%)
5:7741(70.9%)
6:54(0.5%)
10924 (63.7%) 6236 (36.3%)
16 father_occup [integer] Mean (sd) : 5.6 (2.6) min < med < max: -1 < 6 < 9 IQR (CV) : 4 (0.5) 11 distinct values 10357 (60.4%) 6803 (39.6%)
17 mother_occup [integer] Mean (sd) : 5.7 (2.6) min < med < max: -1 < 5 < 9 IQR (CV) : 5 (0.5)
-1:36(1.2%)
1:132(4.3%)
2:339(11.1%)
3:129(4.2%)
4:283(9.3%)
5:675(22.2%)
6:295(9.7%)
7:220(7.2%)
8:123(4.0%)
9:810(26.6%)
3042 (17.7%) 14118 (82.3%)
18 father_manag [integer] Mean (sd) : 1.8 (0.4) min < med < max: -1 < 2 < 2 IQR (CV) : 0 (0.2)
-1:20(0.2%)
1:2235(21.5%)
2:8125(78.3%)
10380 (60.5%) 6780 (39.5%)
19 mother_manag [integer] Mean (sd) : 1.9 (0.3) min < med < max: -1 < 2 < 2 IQR (CV) : 0 (0.2)
-1:2(0.1%)
1:377(12.5%)
2:2648(87.5%)
3027 (17.6%) 14133 (82.4%)
20 tenancy [integer] Mean (sd) : 1.2 (0.5) min < med < max: -1 < 1 < 3 IQR (CV) : 0 (0.4)
-1:42(0.4%)
1:9211(82.1%)
2:1614(14.4%)
3:355(3.2%)
11222 (65.4%) 5938 (34.6%)
21 monthly_income [integer] Mean (sd) : 1570.2 (605.9) min < med < max: 1 < 1928 < 1928 IQR (CV) : 594.2 (0.4) 1535 distinct values 17160 (100.0%) 0 (0.0%)
22 age [numeric] Mean (sd) : 42.6 (9) min < med < max: 27 < 42 < 59 IQR (CV) : 15 (0.2) 33 distinct values 17160 (100.0%) 0 (0.0%)
23 log_income [numeric] Mean (sd) : 10.3 (1.3) min < med < max: 0 < 10.5 < 13.2 IQR (CV) : 1 (0.1) 9051 distinct values 17071 (99.5%) 89 (0.5%)
24 citizenship [factor] 1. 1 2. 2 3. 3
14628(85.2%)
777(4.5%)
1755(10.2%)
17160 (100.0%) 0 (0.0%)

Generated by summarytools 0.9.8 (R version 4.0.3)
2021-02-16

# Render the summarytools data-frame summary of the Finnish sample.
# NOTE: the parental-background variables are missing for roughly 64-78% of
# the rows, and father_manag / mother_manag contain only the value -1, so
# there may be too many missing values for Finland to be usable in the
# analysis.
print(dfSummary(FI_equality_data), method="render")

Data Frame Summary

FI_equality_data

Dimensions: 8342 x 24
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 year_of_birth [integer] Mean (sd) : 1967.5 (9.7) min < med < max: 1952 < 1967 < 1984 IQR (CV) : 17 (0) 33 distinct values 8342 (100.0%) 0 (0.0%)
2 annual_income [integer] Mean (sd) : 62540.8 (39466.2) min < med < max: -19864 < 56088 < 589133 IQR (CV) : 43639.2 (0.6) 5362 distinct values 8342 (100.0%) 0 (0.0%)
3 sex [integer] Min : 1 Mean : 1.5 Max : 2
1:4236(50.8%)
2:4106(49.2%)
8342 (100.0%) 0 (0.0%)
4 PB220A [character] 1. EU 2. FI 3. Other
100(1.2%)
8071(96.8%)
171(2.0%)
8342 (100.0%) 0 (0.0%)
5 parents_present [integer] Mean (sd) : 1.3 (0.8) min < med < max: 1 < 1 < 5 IQR (CV) : 0 (0.6)
1:2514(83.5%)
2:47(1.6%)
3:390(13.0%)
4:7(0.2%)
5:51(1.7%)
3009 (36.1%) 5333 (63.9%)
6 adults_home [integer] Mean (sd) : 2.3 (0.9) min < med < max: 0 < 2 < 8 IQR (CV) : 1 (0.4)
0:1(0.0%)
1:296(10.0%)
2:1769(59.6%)
3:629(21.2%)
4:213(7.2%)
5:40(1.3%)
6:14(0.5%)
7:5(0.2%)
8:2(0.1%)
2969 (35.6%) 5373 (64.4%)
7 children_home [integer] Mean (sd) : 2.2 (1.3) min < med < max: 0 < 2 < 15 IQR (CV) : 2 (0.6) 14 distinct values 2938 (35.2%) 5404 (64.8%)
8 father_cob [integer] Mean (sd) : 1.1 (0.4) min < med < max: 1 < 1 < 4 IQR (CV) : 0 (0.4)
1:2509(96.1%)
2:33(1.3%)
3:25(1.0%)
4:43(1.6%)
2610 (31.3%) 5732 (68.7%)
9 father_cit [integer] Mean (sd) : 1.1 (0.4) min < med < max: 1 < 1 < 4 IQR (CV) : 0 (0.4)
1:2441(96.7%)
2:30(1.2%)
3:18(0.7%)
4:36(1.4%)
2525 (30.3%) 5817 (69.7%)
10 mother_cob [integer] Mean (sd) : 1.1 (0.4) min < med < max: 1 < 1 < 4 IQR (CV) : 0 (0.3)
1:2857(97.4%)
2:20(0.7%)
3:24(0.8%)
4:32(1.1%)
2933 (35.2%) 5409 (64.8%)
11 mother_cit [integer] Mean (sd) : 1.1 (0.4) min < med < max: 1 < 1 < 4 IQR (CV) : 0 (0.4)
1:2793(97.3%)
2:19(0.7%)
3:25(0.9%)
4:34(1.2%)
2871 (34.4%) 5471 (65.6%)
12 father_edu [integer] Mean (sd) : 1.7 (0.9) min < med < max: -1 < 1 < 3 IQR (CV) : 1 (0.5)
-1:11(0.4%)
0:51(2.0%)
1:1305(50.7%)
2:606(23.6%)
3:599(23.3%)
2572 (30.8%) 5770 (69.2%)
13 mother_edu [integer] Mean (sd) : 1.6 (0.8) min < med < max: -1 < 1 < 3 IQR (CV) : 1 (0.5)
-1:13(0.4%)
0:61(2.1%)
1:1531(52.3%)
2:792(27.0%)
3:531(18.1%)
2928 (35.1%) 5414 (64.9%)
14 father_occup_stat [integer] Mean (sd) : 1.4 (0.8) min < med < max: -1 < 1 < 6 IQR (CV) : 1 (0.6)
-1:11(0.5%)
1:1558(70.9%)
2:497(22.6%)
3:64(2.9%)
4:35(1.6%)
5:2(0.1%)
6:31(1.4%)
2198 (26.3%) 6144 (73.7%)
15 mother_occup_stat [integer] Mean (sd) : 1.5 (1.1) min < med < max: -1 < 1 < 6 IQR (CV) : 1 (0.7)
-1:4(0.2%)
1:1626(72.3%)
2:391(17.4%)
3:76(3.4%)
4:15(0.7%)
5:110(4.9%)
6:26(1.2%)
2248 (26.9%) 6094 (73.1%)
16 father_occup [integer] Mean (sd) : 3.1 (3.6) min < med < max: -1 < 3 < 9 IQR (CV) : 8 (1.2) 11 distinct values 2379 (28.5%) 5963 (71.5%)
17 mother_occup [integer] Mean (sd) : 5.2 (2.5) min < med < max: 1 < 5 < 9 IQR (CV) : 5 (0.5)
1:29(1.6%)
2:286(15.6%)
3:205(11.2%)
4:289(15.8%)
5:354(19.3%)
6:97(5.3%)
7:90(4.9%)
8:108(5.9%)
9:375(20.5%)
1833 (22.0%) 6509 (78.0%)
18 father_manag [integer] 1 distinct value
-1:2074(100.0%)
2074 (24.9%) 6268 (75.1%)
19 mother_manag [integer] 1 distinct value
-1:2029(100.0%)
2029 (24.3%) 6313 (75.7%)
20 tenancy [integer] Mean (sd) : 1.2 (0.5) min < med < max: -1 < 1 < 2 IQR (CV) : 0 (0.4)
-1:24(0.8%)
1:2289(78.0%)
2:620(21.1%)
2933 (35.2%) 5409 (64.8%)
21 monthly_income [integer] 1 distinct value
1:8342(100.0%)
8342 (100.0%) 0 (0.0%)
22 age [numeric] Mean (sd) : 43.5 (9.7) min < med < max: 27 < 44 < 59 IQR (CV) : 17 (0.2) 33 distinct values 8342 (100.0%) 0 (0.0%)
23 log_income [numeric] Mean (sd) : 10.8 (0.9) min < med < max: 0 < 10.9 < 13.3 IQR (CV) : 0.8 (0.1) 5359 distinct values 8338 (100.0%) 4 (0.0%)
24 citizenship [factor] 1. 1 2. 2 3. 3
100(1.2%)
8071(96.8%)
171(2.0%)
8342 (100.0%) 0 (0.0%)

Generated by summarytools 0.9.8 (R version 4.0.3)
2021-02-16

# Render the summarytools data-frame summary of the Italian sample
# (per-variable statistics, value frequencies and valid/missing counts).
print(dfSummary(IT_equality_data), method="render")

Data Frame Summary

IT_equality_data

Dimensions: 38223 x 24
Duplicates: 4
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 year_of_birth [integer] Mean (sd) : 1969.8 (9.3) min < med < max: 1952 < 1970 < 1984 IQR (CV) : 16 (0) 33 distinct values 38223 (100.0%) 0 (0.0%)
2 annual_income [integer] Mean (sd) : 34430843 (44666780) min < med < max: 1153382 < 15050399 < 129812463 IQR (CV) : 23645923 (1.3) 113 distinct values 38223 (100.0%) 0 (0.0%)
3 sex [integer] Min : 1 Mean : 1.5 Max : 2
1:19385(50.7%)
2:18838(49.3%)
38223 (100.0%) 0 (0.0%)
4 PB220A [character] 1. EU 2. IT 3. Other
1616(4.2%)
34087(89.2%)
2520(6.6%)
38223 (100.0%) 0 (0.0%)
5 parents_present [integer] Mean (sd) : 1.2 (0.8) min < med < max: 1 < 1 < 5 IQR (CV) : 0 (0.7)
1:20917(90.1%)
2:262(1.1%)
3:1224(5.3%)
4:133(0.6%)
5:670(2.9%)
23206 (60.7%) 15017 (39.3%)
6 adults_home [integer] Mean (sd) : 2.6 (1.1) min < med < max: 1 < 2 < 19 IQR (CV) : 1 (0.4) 14 distinct values 22474 (58.8%) 15749 (41.2%)
7 children_home [integer] Mean (sd) : 2.4 (1.4) min < med < max: 1 < 2 < 13 IQR (CV) : 2 (0.6) 13 distinct values 22529 (58.9%) 15694 (41.1%)
8 father_cob [integer] Mean (sd) : 1.2 (0.7) min < med < max: 1 < 1 < 4 IQR (CV) : 0 (0.6)
1:19317(90.6%)
2:498(2.3%)
3:501(2.4%)
4:1001(4.7%)
21317 (55.8%) 16906 (44.2%)
9 father_cit [integer] Mean (sd) : 1.2 (0.7) min < med < max: 1 < 1 < 4 IQR (CV) : 0 (0.6)
1:19034(90.8%)
2:486(2.3%)
3:533(2.5%)
4:911(4.3%)
20964 (54.8%) 17259 (45.2%)
10 mother_cob [integer] Mean (sd) : 1.2 (0.7) min < med < max: 1 < 1 < 4 IQR (CV) : 0 (0.6)
1:19920(89.8%)
2:628(2.8%)
3:524(2.4%)
4:1102(5.0%)
22174 (58.0%) 16049 (42.0%)
11 mother_cit [integer] Mean (sd) : 1.2 (0.7) min < med < max: 1 < 1 < 4 IQR (CV) : 0 (0.6)
1:19860(90.1%)
2:551(2.5%)
3:523(2.4%)
4:1098(5.0%)
22032 (57.6%) 16191 (42.4%)
12 father_edu [integer] Mean (sd) : 1.2 (0.6) min < med < max: 0 < 1 < 3 IQR (CV) : 0 (0.5)
0:600(2.8%)
1:15967(75.6%)
2:3579(16.9%)
3:978(4.6%)
21124 (55.3%) 17099 (44.7%)
13 mother_edu [integer] Mean (sd) : 1.2 (0.5) min < med < max: 0 < 1 < 3 IQR (CV) : 0 (0.4)
0:822(3.7%)
1:17546(79.3%)
2:3120(14.1%)
3:634(2.9%)
22122 (57.9%) 16101 (42.1%)
14 father_occup_stat [integer] Mean (sd) : 1.5 (0.9) min < med < max: 1 < 1 < 6 IQR (CV) : 1 (0.6)
1:14302(67.3%)
2:5734(27.0%)
3:346(1.6%)
4:356(1.7%)
5:84(0.4%)
6:419(2.0%)
21241 (55.6%) 16982 (44.4%)
15 mother_occup_stat [integer] Mean (sd) : 3.7 (1.8) min < med < max: 1 < 5 < 6 IQR (CV) : 4 (0.5)
1:5568(25.0%)
2:1897(8.5%)
3:126(0.6%)
4:142(0.6%)
5:14431(64.9%)
6:75(0.3%)
22239 (58.2%) 15984 (41.8%)
16 father_occup [integer] Mean (sd) : 5.7 (2.6) min < med < max: -1 < 7 < 9 IQR (CV) : 3 (0.5) 11 distinct values 19993 (52.3%) 18230 (47.7%)
17 mother_occup [integer] Mean (sd) : 5.4 (2.6) min < med < max: -1 < 5 < 9 IQR (CV) : 5 (0.5) 11 distinct values 7482 (19.6%) 30741 (80.4%)
18 father_manag [integer] Min : 1 Mean : 1.8 Max : 2
1:4802(24.1%)
2:15133(75.9%)
19935 (52.2%) 18288 (47.8%)
19 mother_manag [integer] Min : 1 Mean : 1.9 Max : 2
1:1034(13.8%)
2:6437(86.2%)
7471 (19.5%) 30752 (80.5%)
20 tenancy [integer] Mean (sd) : 1.3 (0.5) min < med < max: 1 < 1 < 3 IQR (CV) : 1 (0.4)
1:15564(68.7%)
2:6277(27.7%)
3:806(3.6%)
22647 (59.2%) 15576 (40.8%)
21 monthly_income [integer] Mean (sd) : 1335.3 (543.3) min < med < max: 1 < 1675 < 1675 IQR (CV) : 715 (0.4) 1459 distinct values 38223 (100.0%) 0 (0.0%)
22 age [numeric] Mean (sd) : 41.2 (9.3) min < med < max: 27 < 41 < 59 IQR (CV) : 16 (0.2) 33 distinct values 38223 (100.0%) 0 (0.0%)
23 log_income [numeric] Mean (sd) : 16.7 (1.1) min < med < max: 14 < 16.5 < 18.7 IQR (CV) : 1.3 (0.1) 113 distinct values 38223 (100.0%) 0 (0.0%)
24 citizenship [factor] 1. 1 2. 2 3. 3
1616(4.2%)
34087(89.2%)
2520(6.6%)
38223 (100.0%) 0 (0.0%)

Generated by summarytools 0.9.8 (R version 4.0.3)
2021-02-16

# Render the summarytools data-frame summary of the Latvian sample
# (per-variable statistics, value frequencies and valid/missing counts).
print(dfSummary(LV_equality_data), method="render")

Data Frame Summary

LV_equality_data

Dimensions: 7288 x 24
Duplicates: 0
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 year_of_birth [integer] Mean (sd) : 1967.8 (9.6) min < med < max: 1952 < 1968 < 1984 IQR (CV) : 17 (0) 33 distinct values 7288 (100.0%) 0 (0.0%)
2 annual_income [integer] Mean (sd) : 15121.8 (11704.9) min < med < max: 0 < 12549 < 125579 IQR (CV) : 13676.5 (0.8) 4007 distinct values 7288 (100.0%) 0 (0.0%)
3 sex [integer] Min : 1 Mean : 1.5 Max : 2
1:3488(47.9%)
2:3800(52.1%)
7288 (100.0%) 0 (0.0%)
4 PB220A [character] 1. EU 2. LV 3. Other
21(0.3%)
6049(83.0%)
1218(16.7%)
7288 (100.0%) 0 (0.0%)
5 parents_present [integer] Mean (sd) : 1.5 (0.9) min < med < max: 1 < 1 < 5 IQR (CV) : 0 (0.6)
1:3550(76.4%)
2:75(1.6%)
3:895(19.3%)
4:84(1.8%)
5:40(0.9%)
4644 (63.7%) 2644 (36.3%)
6 adults_home [integer] Mean (sd) : 2 (0.7) min < med < max: 0 < 2 < 11 IQR (CV) : 0 (0.3) 11 distinct values 4546 (62.4%) 2742 (37.6%)
7 children_home [integer] Mean (sd) : 2.2 (1.2) min < med < max: 1 < 2 < 13 IQR (CV) : 2 (0.6) 12 distinct values 4537 (62.3%) 2751 (37.7%)
8 father_cob [integer] Mean (sd) : 1.3 (0.5) min < med < max: -1 < 1 < 2 IQR (CV) : 1 (0.4)
-1:9(0.2%)
1:2716(71.4%)
2:1079(28.4%)
3804 (52.2%) 3484 (47.8%)
9 father_cit [integer] Mean (sd) : 1.2 (0.5) min < med < max: -1 < 1 < 2 IQR (CV) : 0 (0.4)
-1:56(1.6%)
1:2834(79.7%)
2:667(18.8%)
3557 (48.8%) 3731 (51.2%)
10 mother_cob [integer] Mean (sd) : 1.3 (0.4) min < med < max: -1 < 1 < 2 IQR (CV) : 1 (0.4)
-1:3(0.1%)
1:3302(73.2%)
2:1209(26.8%)
4514 (61.9%) 2774 (38.1%)
11 mother_cit [integer] Mean (sd) : 1.2 (0.4) min < med < max: -1 < 1 < 2 IQR (CV) : 0 (0.4)
-1:26(0.6%)
1:3403(81.5%)
2:749(17.9%)
4178 (57.3%) 3110 (42.7%)
12 father_edu [integer] Mean (sd) : 1.6 (0.9) min < med < max: -1 < 2 < 3 IQR (CV) : 1 (0.6)
-1:185(4.9%)
1:1570(42.0%)
2:1456(38.9%)
3:529(14.1%)
3740 (51.3%) 3548 (48.7%)
13 mother_edu [integer] Mean (sd) : 1.7 (0.8) min < med < max: -1 < 2 < 3 IQR (CV) : 1 (0.5)
-1:140(3.1%)
0:20(0.4%)
1:1691(37.7%)
2:1950(43.5%)
3:680(15.2%)
4481 (61.5%) 2807 (38.5%)
14 father_occup_stat [integer] Mean (sd) : 1 (0.6) min < med < max: -1 < 1 < 6 IQR (CV) : 0 (0.6)
-1:83(2.3%)
1:3392(94.0%)
2:43(1.2%)
3:20(0.6%)
4:38(1.1%)
5:12(0.3%)
6:19(0.5%)
3607 (49.5%) 3681 (50.5%)
15 mother_occup_stat [integer] Mean (sd) : 1.3 (1.1) min < med < max: -1 < 1 < 6 IQR (CV) : 0 (0.9)
-1:76(1.7%)
1:3981(89.7%)
2:26(0.6%)
3:22(0.5%)
4:20(0.5%)
5:299(6.7%)
6:13(0.3%)
4437 (60.9%) 2851 (39.1%)
16 father_occup [integer] Mean (sd) : 6 (2.7) min < med < max: -1 < 7 < 9 IQR (CV) : 4 (0.5) 11 distinct values 3605 (49.5%) 3683 (50.5%)
17 mother_occup [integer] Mean (sd) : 5.1 (2.8) min < med < max: -1 < 5 < 9 IQR (CV) : 4 (0.5) 11 distinct values 4070 (55.8%) 3218 (44.2%)
18 father_manag [integer] Mean (sd) : 1.6 (0.9) min < med < max: -1 < 2 < 2 IQR (CV) : 0 (0.6)
-1:343(9.6%)
1:309(8.6%)
2:2924(81.8%)
3576 (49.1%) 3712 (50.9%)
19 mother_manag [integer] Mean (sd) : 1.6 (0.9) min < med < max: -1 < 2 < 2 IQR (CV) : 0 (0.6)
-1:414(10.2%)
1:361(8.9%)
2:3287(80.9%)
4062 (55.7%) 3226 (44.3%)
20 tenancy [integer] Mean (sd) : 1.8 (0.8) min < med < max: 1 < 2 < 3 IQR (CV) : 1 (0.5)
1:2097(46.0%)
2:1343(29.4%)
3:1123(24.6%)
4563 (62.6%) 2725 (37.4%)
21 monthly_income [integer] 1 distinct value
1:7288(100.0%)
7288 (100.0%) 0 (0.0%)
22 age [numeric] Mean (sd) : 43.2 (9.6) min < med < max: 27 < 43 < 59 IQR (CV) : 17 (0.2) 33 distinct values 7288 (100.0%) 0 (0.0%)
23 log_income [numeric] Mean (sd) : 9.2 (1.4) min < med < max: 0 < 9.4 < 11.7 IQR (CV) : 1.1 (0.2) 4007 distinct values 7288 (100.0%) 0 (0.0%)
24 citizenship [factor] 1. 1 2. 2 3. 3
21(0.3%)
6049(83.0%)
1218(16.7%)
7288 (100.0%) 0 (0.0%)

Generated by summarytools 0.9.8 (R version 4.0.3)
2021-02-16

Summary Statistics All Countries

## Joining, by = c("Country", "Sample Size", "Avg. Equ.Income", "Std. dev.", "Gini")
## Joining, by = c("Country", "Sample Size", "Avg. Equ.Income", "Std. dev.", "Gini")
## Joining, by = c("Country", "Sample Size", "Avg. Equ.Income", "Std. dev.", "Gini")
## Joining, by = c("Country", "Sample Size", "Avg. Equ.Income", "Std. dev.", "Gini")
## Joining, by = c("Country", "Sample Size", "Avg. Equ.Income", "Std. dev.", "Gini")
## Joining, by = c("Country", "Sample Size", "Avg. Equ.Income", "Std. dev.", "Gini")
Country Statistics
Country Sample Size Avg. Equ.Income Std. dev. Gini
AT 6741 72909.75 51328.24 0.3620218
FR 11013 59673.42 48605.76 0.3540666
DK 4536 82525.95 48369.46 0.3055454
ES 17160 42864.46 32836.60 0.3851535
FI 8342 62540.79 39466.20 0.3236630
IT 38223 34430843.26 44666779.49 0.5799582
LV 7288 15121.82 11704.88 0.4041037

Method: Conditional Inference Trees

Advantages

Advantages of Trees: straightforward to interpret

Advantages of Trees over linear regression models: very large set of observations can be used & model specification is no longer exogenously given

Advantages of Conditional Inference Trees over Classification and Regression Trees (CART): the algorithm automatically provides a test for the null hypothesis of equality of opportunity, it prevents overfitting (whereas CART “cannot distinguish between a significant and an insignificant improvement in the information measure” (Mingers 1987, as cited in @hot, 2)), and it considers the distributional properties of the measures.

Procedure

The algorithm follows a stepwise procedure [@brunori20, 7-8]:

  1. Choose confidence level \((1-\alpha)\). Test the null hypothesis of independence, \(H_0^{C^p} : D(Y|C^p) = D(Y)\), for each input variable \(C^p \in \hat{\Omega}\), and obtain a p-value associated with each test, \(p^{C^p}\). \(\implies\) We adjust the p-values for multiple hypothesis testing, such that \(p_{adj.}^{C^p} = 1-(1-p^{C^p})^P\), which essentially means that we use the so-called Bonferroni correction.
  2. Choose feature: test all the null hypotheses of independence between the individual outcome and each of the observable circumstances (variables). The model selects the variable, \(C^*\), with the lowest adjusted p-value, i.e. \(C^* = \{C^p : \text{argmin} ~ p_{adj.}^{C^p} \}\).
    1. No hypothesis can be rejected: stop. \(\implies\) If \(p_{adj.}^{C^*} > \alpha\): exit the algorithm.
    2. One or more circumstances are significant: select the circumstance with the smallest p-value and proceed. \(\implies\) If \(p_{adj.}^{C^*} \leq \alpha\): continue, and select \(C^*\) as the splitting variable.
  3. Choose split: for every possible way the selected circumstance can divide the sample into two subgroups, test the hypothesis of same mean outcome in the two resulting subgroups. Choose the splitting point with the smallest p-value. Technically, we test the discrepancy between the subsamples for each possible binary partition, s, based on \(C^*\), meaning that \(Y_s = \{Y_i : C^*_i < x^p \}\) and \(Y_{-s} = \{Y_i : C^*_i \geq x^p \}\), and obtain a p-value associated with each test, \(p^{C^*_s}\).

\(\implies\) Split the sample based on \(C^*_s\), by choosing the split point s that yields the lowest p-value, which is \(C^*_s = \{C^*_s : \text{argmin} ~ p^{C^*_s} \}\).

  4. Repeat the procedure for each of the resulting subsamples.

Regression Trees

Following [@brunori20] we split the data into training and testing data by \(2/3:1/3\). Furthermore, we chose to show the results obtained using regression trees obtained from the ‘rpart’ package. The training and test data sets will be continually used also for further analysis when we proceed with ‘cTree’.

# Fix the random seed so that the train/test assignment is reproducible.
set.seed(123)

# Randomly label each Austrian observation as "train" (probability 0.67) or
# "test" (probability 0.33).
AT_equality_data <- mutate(
  AT_equality_data,
  train_index = sample(
    c("train", "test"),
    size = nrow(AT_equality_data),
    replace = TRUE,
    prob = c(0.67, 0.33)
  )
)

AT_train <- filter(AT_equality_data, train_index == "train")
AT_test <- filter(AT_equality_data, train_index == "test")

# Model used for every tree and forest below: log income explained by the
# circumstance variables.
formula <- log_income ~ sex + parents_present + adults_home + children_home +
  father_cob + father_cit + mother_cob + mother_cit +
  father_edu + mother_edu +
  father_occup_stat + mother_occup_stat +
  father_occup + mother_occup +
  father_manag + mother_manag +
  tenancy

# Regression tree on the training sample; cp is rpart's complexity parameter.
AT_tree <- rpart(formula, data = AT_train, cp = 0.008)

print(AT_tree)
## n= 4536 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 4536 6006.255 10.86120  
##   2) sex>=1.5 2284 4126.054 10.74883 *
##   3) sex< 1.5 2252 1822.114 10.97516 *
# Draw the fitted regression tree for Austria.
rpart.plot(
  AT_tree,
  type = 2,
  nn = FALSE,
  box.palette = "RdBu",
  main = "Regression Tree for Austria 2011"
)

# Randomly label each French observation as "train" (probability 0.67) or
# "test" (probability 0.33).
FR_equality_data <- mutate(
  FR_equality_data,
  train_index = sample(
    c("train", "test"),
    size = nrow(FR_equality_data),
    replace = TRUE,
    prob = c(0.67, 0.33)
  )
)

FR_train <- filter(FR_equality_data, train_index == "train")
FR_test <- filter(FR_equality_data, train_index == "test")

# Regression tree on the training sample; cp is rpart's complexity parameter.
FR_tree <- rpart(formula, data = FR_train, cp = 0.003)

print(FR_tree)
## n= 7348 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 7348 5852.9600 10.72465  
##   2) sex>=1.5 3766 3520.8540 10.66340 *
##   3) sex< 1.5 3582 2303.1310 10.78903  
##     6) father_edu< 1.5 3090 2068.1160 10.75670 *
##     7) father_edu>=1.5 492  211.4952 10.99211 *
# Draw the fitted regression tree for France.
rpart.plot(
  FR_tree,
  type = 2,
  nn = FALSE,
  box.palette = "RdBu",
  main = "Regression Tree for France 2011"
)

# Randomly label each Spanish observation as "train" (probability 0.67) or
# "test" (probability 0.33).
ES_equality_data <- mutate(
  ES_equality_data,
  train_index = sample(
    c("train", "test"),
    size = nrow(ES_equality_data),
    replace = TRUE,
    prob = c(0.67, 0.33)
  )
)

ES_train <- filter(ES_equality_data, train_index == "train")
ES_test <- filter(ES_equality_data, train_index == "test")

# Regression tree on the training sample; cp is rpart's complexity parameter.
ES_tree <- rpart(formula, data = ES_train, cp = 0.003)

print(ES_tree)
## n=11504 (60 observations deleted due to missingness)
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 11504 20621.54 10.28816 *
# Draw the fitted regression tree for Spain.
rpart.plot(
  ES_tree,
  type = 2,
  nn = FALSE,
  box.palette = "RdBu",
  main = "Regression Tree for Spain 2011"
)

# Randomly label each Danish observation as "train" (probability 0.67) or
# "test" (probability 0.33).
DK_equality_data <- mutate(
  DK_equality_data,
  train_index = sample(
    c("train", "test"),
    size = nrow(DK_equality_data),
    replace = TRUE,
    prob = c(0.67, 0.33)
  )
)

DK_train <- filter(DK_equality_data, train_index == "train")
DK_test <- filter(DK_equality_data, train_index == "test")

# Regression tree on the training sample; cp is rpart's complexity parameter.
DK_tree <- rpart(formula, data = DK_train, cp = 0.003)

print(DK_tree)
## n=3061 (24 observations deleted due to missingness)
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 3061 1395.174 11.15718 *
# Draw the fitted regression tree for Denmark.
rpart.plot(
  DK_tree,
  type = 2,
  nn = FALSE,
  box.palette = "RdBu",
  main = "Regression Tree for Denmark 2011"
)

# Randomly label each Italian observation as "train" (probability 0.67) or
# "test" (probability 0.33).
IT_equality_data <- mutate(
  IT_equality_data,
  train_index = sample(
    c("train", "test"),
    size = nrow(IT_equality_data),
    replace = TRUE,
    prob = c(0.67, 0.33)
  )
)

IT_train <- filter(IT_equality_data, train_index == "train")
IT_test <- filter(IT_equality_data, train_index == "test")

# Regression tree on the training sample; cp is rpart's complexity parameter.
IT_tree <- rpart(formula, data = IT_train, cp = 0.003)

print(IT_tree)
## n= 25540 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 25540 32118.68 16.67441  
##   2) sex>=1.5 12576 14422.10 16.59351 *
##   3) sex< 1.5 12964 17534.41 16.75289 *
# Draw the fitted regression tree for Italy.
rpart.plot(
  IT_tree,
  type = 2,
  nn = FALSE,
  box.palette = "RdBu",
  main = "Regression Tree for Italy 2011"
)

# Randomly label each Finnish observation as "train" (probability 0.67) or
# "test" (probability 0.33).
FI_equality_data <- mutate(
  FI_equality_data,
  train_index = sample(
    c("train", "test"),
    size = nrow(FI_equality_data),
    replace = TRUE,
    prob = c(0.67, 0.33)
  )
)

FI_train <- filter(FI_equality_data, train_index == "train")
FI_test <- filter(FI_equality_data, train_index == "test")

# Regression tree on the training sample; cp is rpart's complexity parameter.
FI_tree <- rpart(formula, data = FI_train, cp = 0.003)

print(FI_tree)
## n=5612 (4 observations deleted due to missingness)
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 5612 4221.345 10.81804 *
# Draw the fitted regression tree for Finland.
rpart.plot(
  FI_tree,
  type = 2,
  nn = FALSE,
  box.palette = "RdBu",
  main = "Regression Tree for Finland 2011"
)

# Randomly label each Latvian observation as "train" (probability 0.67) or
# "test" (probability 0.33).
LV_equality_data <- mutate(
  LV_equality_data,
  train_index = sample(
    c("train", "test"),
    size = nrow(LV_equality_data),
    replace = TRUE,
    prob = c(0.67, 0.33)
  )
)

LV_train <- filter(LV_equality_data, train_index == "train")
LV_test <- filter(LV_equality_data, train_index == "test")

# Regression tree on the training sample; cp is rpart's complexity parameter.
LV_tree <- rpart(formula, data = LV_train, cp = 0.003)

print(LV_tree)
## n= 4851 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 4851 9915.431 9.168963  
##   2) mother_occup>=4.5 3640 8286.543 9.068246 *
##   3) mother_occup< 4.5 1211 1480.978 9.471696 *
# Draw the fitted regression tree for Latvia.
rpart.plot(
  LV_tree,
  type = 2,
  nn = FALSE,
  box.palette = "RdBu",
  main = "Regression Tree for Latvia 2011"
)

Conditional Inference Trees

# Conditional inference tree for Austria, grown on the training sample with
# ctree()'s default control settings.
AT_Ctree <- ctree(formula, data = AT_train)

print(AT_Ctree)
## 
## Model formula:
## log_income ~ sex + parents_present + adults_home + children_home + 
##     father_cob + father_cit + mother_cob + mother_cit + father_edu + 
##     mother_edu + father_occup_stat + mother_occup_stat + father_occup + 
##     mother_occup + father_manag + mother_manag + tenancy
## 
## Fitted party:
## [1] root
## |   [2] mother_cob <= 1
## |   |   [3] sex <= 1
## |   |   |   [4] father_edu <= 1: 10.932 (n = 697, err = 628.9)
## |   |   |   [5] father_edu > 1: 11.051 (n = 1079, err = 631.2)
## |   |   [6] sex > 1
## |   |   |   [7] mother_cit <= 2: 10.842 (n = 1514, err = 2235.1)
## |   |   |   [8] mother_cit > 2: 10.566 (n = 223, err = 621.0)
## |   [9] mother_cob > 1
## |   |   [10] sex <= 1
## |   |   |   [11] children_home <= 5
## |   |   |   |   [12] mother_cob <= 2: 11.046 (n = 204, err = 101.4)
## |   |   |   |   [13] mother_cob > 2
## |   |   |   |   |   [14] father_edu <= 1: 10.706 (n = 105, err = 122.6)
## |   |   |   |   |   [15] father_edu > 1
## |   |   |   |   |   |   [16] mother_cit <= 2
## |   |   |   |   |   |   |   [17] mother_manag <= 1: 10.361 (n = 24, err = 128.7)
## |   |   |   |   |   |   |   [18] mother_manag > 1: 10.956 (n = 111, err = 43.0)
## |   |   |   |   |   |   [19] mother_cit > 2: 10.600 (n = 13, err = 9.6)
## |   |   |   [20] children_home > 5: 10.100 (n = 19, err = 115.2)
## |   |   [21] sex > 1: 10.564 (n = 547, err = 1230.6)
## 
## Number of inner nodes:    10
## Number of terminal nodes: 11
# Plot the Austrian conditional inference tree with compact terminal nodes.
plot(
  AT_Ctree,
  type = "simple",
  gp = gpar(fontsize = 6),
  inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Conditional Inference Tree for Austria 2011"
)

Cross Validation using the Caret package

# Resampling scheme for the caret fits: 10-fold cross-validation repeated
# 10 times, keeping the hold-out predictions.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  savePredictions = TRUE
)

# Tune the conditional inference tree for Austria on the training sample.
AT_cctree1 <- train(
  formula,
  data = AT_train,
  method = "ctree",
  trControl = fitControl,
  na.action = na.pass
)

# This is the suggested tree we get from applying caret
print(AT_cctree1)
## Conditional Inference Tree 
## 
## 4536 samples
##   17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 4082, 4083, 4084, 4083, 4082, 4083, ... 
## Resampling results across tuning parameters:
## 
##   mincriterion  RMSE      Rsquared     MAE      
##   0.01          1.165198  0.007522094  0.7112571
##   0.50          1.145303  0.008005095  0.6889176
##   0.99          1.138132  0.011057196  0.6829340
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mincriterion = 0.99.
# Using the suggestion we generate a Conditional Inference Tree and plot it
# as our final result
AT_cct <- ctree(formula, data = AT_train, mincriterion = 0.99)

plot(
  AT_cct,
  gp = gpar(fontsize = 8),
  inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Opportunity Conditional Inference Tree for Austria 2011 - Cross Validated with Caret"
)

# Second tree, fitted on the complete Austrian sample (AT_equality_data) with
# Bonferroni-adjusted tests.
AT_ctree2 <- ctree(
  formula,
  data = AT_equality_data,
  control = ctree_control(testtype = "Bonferroni", mincriterion = 0.99)
)

print(AT_ctree2)
## 
## Model formula:
## log_income ~ sex + parents_present + adults_home + children_home + 
##     father_cob + father_cit + mother_cob + mother_cit + father_edu + 
##     mother_edu + father_occup_stat + mother_occup_stat + father_occup + 
##     mother_occup + father_manag + mother_manag + tenancy
## 
## Fitted party:
## [1] root
## |   [2] sex <= 1
## |   |   [3] mother_cob <= 2
## |   |   |   [4] father_edu <= 0: 10.803 (n = 101, err = 234.8)
## |   |   |   [5] father_edu > 0
## |   |   |   |   [6] mother_cit <= 2
## |   |   |   |   |   [7] father_cit <= 2
## |   |   |   |   |   |   [8] mother_edu <= 1: 11.020 (n = 1301, err = 909.8)
## |   |   |   |   |   |   [9] mother_edu > 1: 11.109 (n = 1011, err = 634.3)
## |   |   |   |   |   [10] father_cit > 2: 10.925 (n = 260, err = 172.8)
## |   |   |   |   [11] mother_cit > 2: 10.891 (n = 278, err = 169.6)
## |   |   [12] mother_cob > 2
## |   |   |   [13] children_home <= 5: 10.788 (n = 357, err = 339.7)
## |   |   |   [14] children_home > 5: 9.958 (n = 12, err = 113.2)
## |   [15] sex > 1
## |   |   [16] mother_cob <= 1
## |   |   |   [17] mother_cit <= 1
## |   |   |   |   [18] father_cit <= 2: 10.867 (n = 1679, err = 2193.3)
## |   |   |   |   [19] father_cit > 2: 10.777 (n = 306, err = 409.7)
## |   |   |   [20] mother_cit > 1: 10.742 (n = 545, err = 853.7)
## |   |   [21] mother_cob > 1: 10.593 (n = 891, err = 2137.9)
## 
## Number of inner nodes:    10
## Number of terminal nodes: 11
# Plot the tree fitted on the complete Austrian sample.
plot(
  AT_ctree2,
  type = "simple",
  gp = gpar(fontsize = 6),
  inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Opportunity Conditional Inference Tree for Austria 2011 - Cross Validated with Ctree"
)

# Predict log income for the Austrian test rows and compare with the observed
# values.
# NOTE(review): AT_ctree2 is fitted on the full AT_equality_data, which
# contains the AT_test rows, so this is not a clean hold-out error -- confirm
# whether a tree fitted on AT_train should be scored here instead.
AT_test$P_AtCt <- predict(AT_ctree2, newdata = as.data.frame(AT_test))

# Squared prediction error per observation.
AT_test$perror <- (AT_test$P_AtCt - AT_test$log_income)^2

# Root mean squared error, stored as a constant column. mean(..., na.rm = TRUE)
# divides by the number of non-missing errors, so rows without a usable error
# do not deflate the RMSE.
AT_test$RMSE <- sqrt(mean(AT_test$perror, na.rm = TRUE))

# For Austria we have an RMSE of 1.2, which is not very good, but this is most
# likely attributable to the synthetic data.

# TODO: plot the errors
# Conditional inference tree for France, grown on the training sample with
# ctree()'s default control settings.
FR_Ctree <- ctree(formula, data = FR_train)

print(FR_Ctree)
## 
## Model formula:
## log_income ~ sex + parents_present + adults_home + children_home + 
##     father_cob + father_cit + mother_cob + mother_cit + father_edu + 
##     mother_edu + father_occup_stat + mother_occup_stat + father_occup + 
##     mother_occup + father_manag + mother_manag + tenancy
## 
## Fitted party:
## [1] root
## |   [2] sex <= 1
## |   |   [3] father_edu <= 1
## |   |   |   [4] mother_cit <= 1: 10.777 (n = 2543, err = 1660.3)
## |   |   |   [5] mother_cit > 1: 10.690 (n = 288, err = 168.2)
## |   |   [6] father_edu > 1: 10.868 (n = 751, err = 466.8)
## |   [7] sex > 1
## |   |   [8] tenancy <= 1
## |   |   |   [9] mother_cit <= 2: 10.697 (n = 2183, err = 1924.0)
## |   |   |   [10] mother_cit > 2: 10.607 (n = 171, err = 146.2)
## |   |   [11] tenancy > 1
## |   |   |   [12] father_cit <= 1: 10.647 (n = 1203, err = 1174.1)
## |   |   |   [13] father_cit > 1: 10.452 (n = 209, err = 263.9)
## 
## Number of inner nodes:    6
## Number of terminal nodes: 7
# Plot the French conditional inference tree with compact terminal nodes.
plot(
  FR_Ctree,
  type = "simple",
  gp = gpar(fontsize = 6),
  inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Conditional Inference Tree for France 2011"
)

# Tune the conditional inference tree for France with repeated
# cross-validation.
FR_cctree <- train(
  formula,
  data = FR_train,
  method = "ctree",
  trControl = fitControl,
  na.action = na.pass
)

# This is the suggested tree we get from applying caret
print(FR_cctree)
## Conditional Inference Tree 
## 
## 7348 samples
##   17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 6612, 6615, 6613, 6615, 6613, 6612, ... 
## Resampling results across tuning parameters:
## 
##   mincriterion  RMSE       Rsquared     MAE      
##   0.01          0.8969202  0.008025817  0.5931404
##   0.50          0.8893252  0.006947776  0.5815613
##   0.99          0.8872471  0.008028844  0.5785668
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mincriterion = 0.99.
# Using the suggestion we generate a Conditional Inference Tree and plot it
# as our final result
FR_cct <- ctree(formula, data = FR_train, mincriterion = 0.99)

plot(
  FR_cct,
  type = "simple",
  gp = gpar(fontsize = 8),
  inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Opportunity Conditional Inference Tree for France 2011 - Cross Validated"
)

# Predict log income for the French test rows with the tree fitted on the
# training sample.
FR_test$P_FRCt <- predict(FR_cct, newdata = as.data.frame(FR_test))

# Squared prediction error per observation.
FR_test$perror <- (FR_test$P_FRCt - FR_test$log_income)^2

# Root mean squared error, stored as a constant column. mean(..., na.rm = TRUE)
# divides by the number of non-missing errors, so rows without a usable error
# do not deflate the RMSE.
FR_test$RMSE <- sqrt(mean(FR_test$perror, na.rm = TRUE))

# RMSE 0.8
# Conditional inference tree for Spain, grown on the training sample with
# ctree()'s default control settings.
ES_Ctree <- ctree(formula, data = ES_train)

print(ES_Ctree)
## 
## Model formula:
## log_income ~ sex + parents_present + adults_home + children_home + 
##     father_cob + father_cit + mother_cob + mother_cit + father_edu + 
##     mother_edu + father_occup_stat + mother_occup_stat + father_occup + 
##     mother_occup + father_manag + mother_manag + tenancy
## 
## Fitted party:
## [1] root
## |   [2] mother_cit <= 1
## |   |   [3] father_cit <= 1
## |   |   |   [4] sex <= 1
## |   |   |   |   [5] father_cob <= 1
## |   |   |   |   |   [6] father_edu <= 0: 10.273 (n = 268, err = NaN)
## |   |   |   |   |   [7] father_edu > 0: 10.477 (n = 3628, err = NaN)
## |   |   |   |   [8] father_cob > 1: 10.189 (n = 638, err = NaN)
## |   |   |   [9] sex > 1
## |   |   |   |   [10] father_cob <= 1: 10.293 (n = 3396, err = NaN)
## |   |   |   |   [11] father_cob > 1: 10.043 (n = 661, err = NaN)
## |   |   [12] father_cit > 1: 10.157 (n = 1370, err = NaN)
## |   [13] mother_cit > 1: 10.107 (n = 1603, err = NaN)
## 
## Number of inner nodes:    6
## Number of terminal nodes: 7
# Plot the Spanish conditional inference tree with compact terminal nodes.
plot(
  ES_Ctree,
  type = "simple",
  gp = gpar(fontsize = 6),
  inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Conditional Inference Tree for Spain 2011"
)

# The Spanish synthetic dataset has many NAs; the output of the tree is
# unreliable as we don't have information on the errors.
# na.omit discards every row that has a missing value, so the resamples are
# built from the complete cases only.
ES_cctree <- train(
  formula,
  data = ES_train,
  method = "ctree",
  trControl = fitControl,
  na.action = na.omit
)

# This is the suggested tree we get from applying caret
print(ES_cctree)
## Conditional Inference Tree 
## 
## 11564 samples
##    17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 6, 6, 6, 6, 6, 6, ... 
## Resampling results across tuning parameters:
## 
##   mincriterion  RMSE       Rsquared  MAE      
##   0.01          0.6213837  NaN       0.6213837
##   0.50          0.6213837  NaN       0.6213837
##   0.99          0.6213837  NaN       0.6213837
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mincriterion = 0.99.
# Using the suggestion we generate a Conditional Inference Tree and plot it
# as our final result
ES_cct <- ctree(formula, data = ES_train, mincriterion = 0.99)

plot(
  ES_cct,
  type = "simple",
  gp = gpar(fontsize = 8),
  inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Opportunity Conditional Inference Tree for Spain 2011 - Cross Validated"
)

# Conditional inference tree for Italy, grown on the training sample with an
# explicitly passed default control object.
IT_Ctree <- ctree(formula, data = IT_train, control = ctree_control())

print(IT_Ctree)
## 
## Model formula:
## log_income ~ sex + parents_present + adults_home + children_home + 
##     father_cob + father_cit + mother_cob + mother_cit + father_edu + 
##     mother_edu + father_occup_stat + mother_occup_stat + father_occup + 
##     mother_occup + father_manag + mother_manag + tenancy
## 
## Fitted party:
## [1] root
## |   [2] sex <= 1: 16.753 (n = 12964, err = 17534.4)
## |   [3] sex > 1
## |   |   [4] father_occup <= 7: 16.584 (n = 9419, err = 10565.6)
## |   |   [5] father_occup > 7: 16.623 (n = 3157, err = 3852.8)
## 
## Number of inner nodes:    2
## Number of terminal nodes: 3
# Plot the Italian conditional inference tree.
plot(
  IT_Ctree,
  gp = gpar(fontsize = 6),
  inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Conditional Inference Tree for Italy 2011"
)

# Tune the conditional inference tree for Italy with repeated
# cross-validation.
IT_cctree <- train(
  formula,
  data = IT_train,
  method = "ctree",
  trControl = fitControl,
  na.action = na.pass
)

# suggests using mincriterion 0.99
print(IT_cctree)
## Conditional Inference Tree 
## 
## 25540 samples
##    17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 22986, 22987, 22985, 22986, 22985, 22986, ... 
## Resampling results across tuning parameters:
## 
##   mincriterion  RMSE      Rsquared     MAE      
##   0.01          1.124577  0.001877554  0.8663355
##   0.50          1.118683  0.005242715  0.8606371
##   0.99          1.118639  0.005288233  0.8601645
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mincriterion = 0.99.
# Plot the final model selected by caret.
plot(IT_cctree$finalModel)

# Refit with the selected mincriterion so the tree can be plotted as a ctree.
IT_cct <- ctree(formula, data = IT_train, mincriterion = 0.99)

plot(
  IT_cct,
  gp = gpar(fontsize = 8),
  inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Opportunity Conditional Inference Tree for Italy 2011 - Cross Validated"
)

# In Italy we have too many NAs among the circumstance variables.
# Predict log income for the Italian test rows with the tree fitted on the
# training sample.
IT_test$P_Ct <- predict(IT_cct, newdata = as.data.frame(IT_test))

# Squared prediction error per observation.
IT_test$perror <- (IT_test$P_Ct - IT_test$log_income)^2

# Root mean squared error, stored as a constant column. mean(..., na.rm = TRUE)
# divides by the number of non-missing errors, so rows without a usable error
# do not deflate the RMSE.
IT_test$RMSE <- sqrt(mean(IT_test$perror, na.rm = TRUE))
# The Denmark set has too many missing values, so we cannot evaluate it with
# the given variables. na.omit discards every row that has a missing value.
DK_cctree <- train(
  formula,
  data = DK_train,
  method = "ctree",
  trControl = fitControl,
  na.action = na.omit
)
## Error: Every row has at least one missing value were found
DK_cctree
## Error in eval(expr, envir, enclos): Objekt 'DK_cctree' nicht gefunden
# The Finland set has too many missing values. na.omit discards every row
# that has a missing value.
FI_cctree <- train(
  formula,
  data = FI_train,
  method = "ctree",
  trControl = fitControl,
  na.action = na.omit
)
## Error: Every row has at least one missing value were found
FI_cctree
## Error in eval(expr, envir, enclos): Objekt 'FI_cctree' nicht gefunden
# Conditional inference tree for Latvia, grown on the training sample with
# ctree()'s default control settings.
LV_Ctree <- ctree(formula, data = LV_train)

print(LV_Ctree)
## 
## Model formula:
## log_income ~ sex + parents_present + adults_home + children_home + 
##     father_cob + father_cit + mother_cob + mother_cit + father_edu + 
##     mother_edu + father_occup_stat + mother_occup_stat + father_occup + 
##     mother_occup + father_manag + mother_manag + tenancy
## 
## Fitted party:
## [1] root
## |   [2] mother_occup <= 4
## |   |   [3] mother_edu <= 2: 9.210 (n = 1849, err = 3313.9)
## |   |   [4] mother_edu > 2: 9.387 (n = 369, err = 638.9)
## |   [5] mother_occup > 4
## |   |   [6] mother_edu <= 1: 9.060 (n = 1099, err = 2396.2)
## |   |   [7] mother_edu > 1
## |   |   |   [8] sex <= 1: 9.190 (n = 712, err = 1905.0)
## |   |   |   [9] sex > 1: 9.106 (n = 822, err = 1624.1)
## 
## Number of inner nodes:    4
## Number of terminal nodes: 5
# Plot the Latvian conditional inference tree.
plot(
  LV_Ctree,
  gp = gpar(fontsize = 8),
  inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Conditional Inference Tree for Latvia 2011"
)

# Tune the conditional inference tree for Latvia with repeated
# cross-validation.
LV_cctree <- train(
  formula,
  data = LV_train,
  method = "ctree",
  trControl = fitControl,
  na.action = na.pass
)

# again we choose mincriterion 0.99 based on the RMSE
print(LV_cctree)
## Conditional Inference Tree 
## 
## 4851 samples
##   17 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 4366, 4366, 4366, 4366, 4364, 4366, ... 
## Resampling results across tuning parameters:
## 
##   mincriterion  RMSE      Rsquared     MAE      
##   0.01          1.442565  0.006763641  0.8731092
##   0.50          1.424655  0.008741515  0.8512662
##   0.99          1.419613  0.009915048  0.8463406
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mincriterion = 0.99.
# Plot the final model selected by caret.
plot(LV_cctree$finalModel)

# we do the control step using the ctree_control function
# NOTE(review): the caret run above selected mincriterion = 0.99, while this
# fit uses 0.95 -- confirm that this is intended.
LV_cct <- ctree(
  formula,
  data = LV_train,
  control = ctree_control(testtype = "Bonferroni", mincriterion = 0.95)
)


plot(
  LV_cct,
  gp = gpar(fontsize = 8),
  inner_panel = node_inner,
  ip_args = list(abbreviate = FALSE, id = FALSE),
  main = "Conditional Inference Tree for Latvia 2011 - Cross Validated"
)

# Predict log income for the Latvian test rows with the tree fitted on the
# training sample.
LV_test$P_Ct <- predict(LV_cct, newdata = as.data.frame(LV_test))

# Squared prediction error per observation.
LV_test$perror <- (LV_test$P_Ct - LV_test$log_income)^2

# Root mean squared error, stored as a constant column. mean(..., na.rm = TRUE)
# divides by the number of non-missing errors, so rows without a usable error
# do not deflate the RMSE.
LV_test$RMSE <- sqrt(mean(LV_test$perror, na.rm = TRUE))

# RMSE of 1.4, which is not so good and does not speak for good predictive
# capabilities of the model

Conditional Forest

# Conditional random forest for Austria, fitted on the complete sample:
# 500 trees, each grown on an 80% subsample drawn without replacement.
AT_cf <- cforest(
  formula,
  AT_equality_data,
  na.action = na.pass,
  control = ctree_control(
    teststat = "quadratic",
    testtype = "Bonferroni",
    mincriterion = 0.99
  ),
  ytrafo = NULL,
  scores = NULL,
  ntree = 500L,
  perturb = list(replace = FALSE, fraction = 0.8)
)

# Forest predictions for the test rows.
# NOTE(review): OOB = TRUE is combined with newdata here, and AThat_cf is not
# used in the code that follows -- confirm both are intended.
AThat_cf <- predict(AT_cf, newdata = AT_test, OOB = TRUE, type = "response")

# Permutation variable importance of the forest.
print(varimp(AT_cf, mincriterion = 0, OOB = TRUE))
##               sex   parents_present       adults_home     children_home 
##      0.0188871224     -0.0009683862      0.0030389690      0.0008731912 
##        father_cob        father_cit        mother_cob        mother_cit 
##      0.0038475944      0.0039683794      0.0086866878      0.0053998440 
##        father_edu        mother_edu mother_occup_stat      father_occup 
##      0.0007573548      0.0031400169     -0.0072261529     -0.0001711782 
##      father_manag      mother_manag 
##     -0.0086178882      0.0060585751
# Variable importance for Austria as a data frame: one row per variable,
# sorted from most to least important and tagged with the country code.
importance_cf <- data.frame(
  importance = varimp(AT_cf, mincriterion = 0, OOB = TRUE)
)
importance_cf$var_name <- rownames(importance_cf)
importance_cf <- mutate(
  arrange(importance_cf, desc(importance)),
  Country = "AT"
)

# Dot plot of the importances, with variables ordered by importance.
ggplot(importance_cf, aes(x = var_name, y = importance)) +
  geom_point() +
  scale_x_discrete(
    limits = with(importance_cf, var_name[order(importance)])
  ) +
  labs(
    title = "Conditional Forest variable importance - Austria 2011",
    x = "",
    y = "Mean decrease in sum of squared residuals"
  ) +
  coord_flip() +
  theme(axis.text.y = element_text(hjust = 0))

# Conditional random forest and importance table for France.
FR_cf <- cforest(
  formula,
  FR_equality_data,
  na.action = na.pass,
  control = ctree_control(
    teststat = "quadratic",
    testtype = "Bonferroni",
    mincriterion = 0.99
  ),
  ytrafo = NULL,
  scores = NULL,
  ntree = 500L,
  perturb = list(replace = FALSE, fraction = 0.8)
)
importance_cf_FR <- data.frame(
  importance = varimp(FR_cf, mincriterion = 0, OOB = TRUE)
)
importance_cf_FR$var_name <- rownames(importance_cf_FR)
importance_cf_FR <- mutate(
  arrange(importance_cf_FR, desc(importance)),
  Country = "FR"
)

# Conditional random forest and importance table for Italy.
IT_cf <- cforest(
  formula,
  IT_equality_data,
  na.action = na.pass,
  control = ctree_control(
    teststat = "quadratic",
    testtype = "Bonferroni",
    mincriterion = 0.99
  ),
  ytrafo = NULL,
  scores = NULL,
  ntree = 500L,
  perturb = list(replace = FALSE, fraction = 0.8)
)

importance_cf_IT <- data.frame(
  importance = varimp(IT_cf, mincriterion = 0, OOB = TRUE)
)
importance_cf_IT$var_name <- rownames(importance_cf_IT)
importance_cf_IT <- mutate(
  arrange(importance_cf_IT, desc(importance)),
  Country = "IT"
)

# Conditional random forest and importance table for Latvia.
LV_cf <- cforest(
  formula,
  LV_equality_data,
  na.action = na.pass,
  control = ctree_control(
    teststat = "quadratic",
    testtype = "Bonferroni",
    mincriterion = 0.99
  ),
  ytrafo = NULL,
  scores = NULL,
  ntree = 500L,
  perturb = list(replace = FALSE, fraction = 0.8)
)

importance_cf_LV <- data.frame(
  importance = varimp(LV_cf, mincriterion = 0, OOB = TRUE)
)
importance_cf_LV$var_name <- rownames(importance_cf_LV)
importance_cf_LV <- mutate(
  arrange(importance_cf_LV, desc(importance)),
  Country = "LV"
)
# Stack the Austrian and French importance tables. The tables share the same
# columns and carry different Country values, so this is a row-bind;
# bind_rows() avoids matching rows on the floating-point importance column.
df <- bind_rows(importance_cf, importance_cf_FR)
## Joining, by = c("importance", "var_name", "Country")
# Italy is currently left out of the comparison:
# df <- bind_rows(df, importance_cf_IT)
# Append the Latvian importance table as additional rows (a row-bind rather
# than a join on every column, which would match on the floating-point
# importance values) and group by country.
df <- bind_rows(df, importance_cf_LV) %>% group_by(Country)
## Joining, by = c("importance", "var_name", "Country")
# Country comparison of the variable importances. The x axis follows France's
# importance ranking; variables that only appear for other countries are
# appended afterwards so that their points are not dropped from the plot.
ggplot(df, aes(x = var_name, y = importance, shape = Country)) +
  geom_point() +
  scale_x_discrete(
    limits = union(
      importance_cf_FR$var_name[order(importance_cf_FR$importance)],
      df$var_name
    )
  ) +
  labs(
    title = "Conditional Forest variable importance - Country Comparison",
    x = "",
    y = "Variable importance"
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## Warning: Removed 2 rows containing missing values (geom_point).

# Dot plot of the French variable importances, ordered by importance.
ggplot(importance_cf_FR, aes(x = var_name, y = importance)) +
  geom_point() +
  scale_x_discrete(
    limits = with(importance_cf_FR, var_name[order(importance)])
  ) +
  labs(
    title = "Conditional Forest variable importance - France 2011",
    x = "",
    y = "Mean decrease in sum of squared residuals"
  ) +
  coord_flip() +
  theme(axis.text.y = element_text(hjust = 0))

# Dot plot of the Italian variable importances, ordered by importance.
ggplot(importance_cf_IT, aes(x = var_name, y = importance)) +
  geom_point() +
  scale_x_discrete(
    limits = with(importance_cf_IT, var_name[order(importance)])
  ) +
  labs(
    title = "Conditional Forest variable importance - Italy 2011",
    x = "",
    y = "Mean decrease in sum of squared residuals"
  ) +
  coord_flip() +
  theme(axis.text.y = element_text(hjust = 0))

# Dot plot of the Latvian variable importances, ordered by importance.
ggplot(importance_cf_LV, aes(x = var_name, y = importance)) +
  geom_point() +
  scale_x_discrete(
    limits = with(importance_cf_LV, var_name[order(importance)])
  ) +
  labs(
    title = "Conditional Forest variable importance - Latvia 2011",
    x = "",
    y = "Mean decrease in sum of squared residuals"
  ) +
  coord_flip() +
  theme(axis.text.y = element_text(hjust = 0))

# Conclusion

Conclusion

References